Gemini models can process videos, enabling many frontier developer use cases that would have historically required domain specific models. Some of Gemini's vision capabilities include the ability to: describe, segment, and extract information from videos, answer questions about video content, and refer to specific timestamps within a video.
You can provide videos as input to Gemini in the following ways:
The following code downloads a sample video, uploads it using the Files API, waits for it to be processed, and then uses the uploaded file reference to summarize the video.
from google import genai

client = genai.Client()

# Upload the video via the Files API; the returned file reference can be
# reused across multiple prompts.
myfile = client.files.upload(file="path/to/sample.mp4")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        myfile,
        "Summarize this video. Then create a quiz with an answer key based on the information in this video.",
    ],
)

print(response.text)
import {
  GoogleGenAI,
  createUserContent,
  createPartFromUri,
} from "@google/genai";

const ai = new GoogleGenAI({});

async function main() {
  // Upload the video via the Files API; the returned file reference can be
  // reused across multiple prompts.
  const myfile = await ai.files.upload({
    file: "path/to/sample.mp4",
    config: { mimeType: "video/mp4" },
  });

  const response = await ai.models.generateContent({
    model: "gemini-2.5-flash",
    contents: createUserContent([
      createPartFromUri(myfile.uri, myfile.mimeType),
      "Summarize this video. Then create a quiz with an answer key based on the information in this video.",
    ]),
  });
  console.log(response.text);
}

await main();
// Upload the video via the Files API. Errors are ignored here for brevity;
// production code should check them.
uploadedFile, _ := client.Files.UploadFromPath(ctx, "path/to/sample.mp4", nil)

parts := []*genai.Part{
	genai.NewPartFromText("Summarize this video. Then create a quiz with an answer key based on the information in this video."),
	genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType),
}

contents := []*genai.Content{
	genai.NewContentFromParts(parts, genai.RoleUser),
}

result, _ := client.Models.GenerateContent(
	ctx,
	"gemini-2.5-flash",
	contents,
	nil,
)

fmt.Println(result.Text())
VIDEO_PATH="path/to/sample.mp4"
MIME_TYPE=$(file -b --mime-type "${VIDEO_PATH}")
NUM_BYTES=$(wc -c < "${VIDEO_PATH}")
DISPLAY_NAME=VIDEO
tmp_header_file=upload-header.tmp

echo "Starting file upload..."
# --- 1. Start a resumable upload session; the response headers carry the
#        session-specific upload URL. ---
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -D "${tmp_header_file}" \
  -H "X-Goog-Upload-Protocol: resumable" \
  -H "X-Goog-Upload-Command: start" \
  -H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Header-Content-Type: ${MIME_TYPE}" \
  -H "Content-Type: application/json" \
  -d "{'file': {'display_name': '${DISPLAY_NAME}'}}" 2> /dev/null

upload_url=$(grep -i "x-goog-upload-url: " "${tmp_header_file}" | cut -d" " -f2 | tr -d "\r")
rm "${tmp_header_file}"

echo "Uploading video data..."
# --- 2. Upload the video bytes and finalize the file in one request. ---
curl "${upload_url}" \
  -H "Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Offset: 0" \
  -H "X-Goog-Upload-Command: upload, finalize" \
  --data-binary "@${VIDEO_PATH}" 2> /dev/null > file_info.json

file_uri=$(jq -r ".file.uri" file_info.json)
echo file_uri=$file_uri
echo "File uploaded successfully. File URI: ${file_uri}"

# --- 3. Generate content using the uploaded video file ---
echo "Generating content from video..."
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [{
      "parts":[
        {"file_data":{"mime_type": "'"${MIME_TYPE}"'", "file_uri": "'"${file_uri}"'"}},
        {"text": "Summarize this video. Then create a quiz with an answer key based on the information in this video."}]
    }]
  }' 2> /dev/null > response.json

jq -r ".candidates[].content.parts[].text" response.json
Always use the Files API when the total request size (including the file, text prompt, system instructions, etc.) is larger than 20 MB, the video duration is significant, or if you intend to use the same video in multiple prompts. The File API accepts video file formats directly.
To learn more about working with media files, see Files API.
Instead of uploading a video file using the File API, you can pass smaller
videos directly in the request to generateContent. This is suitable for
shorter videos under 20MB total request size.
Here's an example of providing inline video data:
from google import genai
from google.genai import types

# Only for videos of size <20Mb
video_file_name = "/path/to/your/video.mp4"
with open(video_file_name, 'rb') as f:
    video_bytes = f.read()

client = genai.Client()

response = client.models.generate_content(
    model='models/gemini-2.5-flash',
    contents=types.Content(
        parts=[
            types.Part(
                inline_data=types.Blob(data=video_bytes, mime_type='video/mp4')
            ),
            types.Part(text='Please summarize the video in 3 sentences.')
        ]
    )
)

print(response.text)
import{GoogleGenAI}from"@google/genai";
import*asfsfrom"node:fs";
constai=newGoogleGenAI({});
constbase64VideoFile=fs.readFileSync("path/to/small-sample.mp4",{
encoding:"base64",
});
constcontents=[
{
inlineData:{
mimeType:"video/mp4",
data:base64VideoFile,
},
},
{text:"Please summarize the video in 3 sentences."}
];
constresponse=awaitai.models.generateContent({
model:"gemini-2.5-flash",
contents:contents,
});
console.log(response.text);
VIDEO_PATH=/path/to/your/video.mp4

# FreeBSD/macOS base64 uses --input; GNU base64 uses -w0 to disable wrapping.
if [[ "$(base64 --version 2>&1)" = *"FreeBSD"* ]]; then
  B64FLAGS="--input"
else
  B64FLAGS="-w0"
fi

curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [{
      "parts":[
        {
          "inline_data": {
            "mime_type":"video/mp4",
            "data": "'$(base64 $B64FLAGS $VIDEO_PATH)'"
          }
        },
        {"text": "Please summarize the video in 3 sentences."}
      ]
    }]
  }' 2> /dev/null
You can pass YouTube URLs directly to Gemini API as part of your request as follows:
from google import genai
from google.genai import types

client = genai.Client()

# Pass a YouTube URL directly as a file_data part; no upload is needed.
response = client.models.generate_content(
    model='models/gemini-2.5-flash',
    contents=types.Content(
        parts=[
            types.Part(
                file_data=types.FileData(file_uri='https://www.youtube.com/watch?v=9hE5-98ZeCg')
            ),
            types.Part(text='Please summarize the video in 3 sentences.')
        ]
    )
)

print(response.text)
import { GoogleGenAI } from "@google/genai";

const ai = new GoogleGenAI({});

// Pass a YouTube URL directly as a fileData part; no upload is needed.
const contents = [
  {
    fileData: {
      fileUri: "https://www.youtube.com/watch?v=9hE5-98ZeCg",
    },
  },
  { text: "Please summarize the video in 3 sentences." },
];

const response = await ai.models.generateContent({
  model: "gemini-2.5-flash",
  contents: contents,
});
console.log(response.text);
packagemain
import(
"context"
"fmt"
"os"
"google.golang.org/genai"
)
funcmain(){
ctx:=context.Background()
client,err:=genai.NewClient(ctx,nil)
iferr!=nil{
log.Fatal(err)
}
parts:=[]*genai.Part{
genai.NewPartFromText("Please summarize the video in 3 sentences."),
genai.NewPartFromURI("https://www.youtube.com/watch?v=9hE5-98ZeCg","video/mp4"),
}
contents:=[]*genai.Content{
genai.NewContentFromParts(parts,genai.RoleUser),
}
result,_:=client.Models.GenerateContent(
ctx,
"gemini-2.5-flash",
contents,
nil,
)
fmt.Println(result.Text())
}
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [{
      "parts":[
        {"text": "Please summarize the video in 3 sentences."},
        {
          "file_data": {
            "file_uri": "https://www.youtube.com/watch?v=9hE5-98ZeCg"
          }
        }
      ]
    }]
  }' 2> /dev/null
Limitations:
You can ask questions about specific points in time within the video using
timestamps of the form MM:SS.
prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?" # Adjusted timestamps for the NASA video
const prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?";
prompt := []*genai.Part{
	genai.NewPartFromURI(currentVideoFile.URI, currentVideoFile.MIMEType),
	// Adjusted timestamps for the NASA video
	genai.NewPartFromText("What are the examples given at 00:05 and " +
		"00:10 supposed to show us?"),
}
PROMPT="What are the examples given at 00:05 and 00:10 supposed to show us?"
Gemini models offer powerful capabilities for understanding video content by processing information from both the audio and visual streams. This lets you extract a rich set of details, including generating descriptions of what is happening in a video and answering questions about its content. For visual descriptions, the model samples the video at a rate of 1 frame per second. This sampling rate may affect the level of detail in the descriptions, particularly for videos with rapidly changing visuals.
prompt = "Describe the key events in this video, providing both audio and visual details. Include timestamps for salient moments."
const prompt = "Describe the key events in this video, providing both audio and visual details. Include timestamps for salient moments.";
prompt := []*genai.Part{
	genai.NewPartFromURI(currentVideoFile.URI, currentVideoFile.MIMEType),
	genai.NewPartFromText("Describe the key events in this video, providing both audio and visual details. " +
		"Include timestamps for salient moments."),
}
PROMPT="Describe the key events in this video, providing both audio and visual details. Include timestamps for salient moments."
You can customize video processing in the Gemini API by setting clipping intervals or providing custom frame rate sampling.
You can clip video by specifying videoMetadata with start and end offsets.
from google import genai
from google.genai import types

client = genai.Client()

# Clip the video to the [start_offset, end_offset] interval via video_metadata.
response = client.models.generate_content(
    model='models/gemini-2.5-flash',
    contents=types.Content(
        parts=[
            types.Part(
                file_data=types.FileData(file_uri='https://www.youtube.com/watch?v=XEzRZ35urlk'),
                video_metadata=types.VideoMetadata(
                    start_offset='1250s',
                    end_offset='1570s'
                )
            ),
            types.Part(text='Please summarize the video in 3 sentences.')
        ]
    )
)
import { GoogleGenAI } from '@google/genai';

const ai = new GoogleGenAI({});
const model = 'gemini-2.5-flash';

async function main() {
  const contents = [
    {
      role: 'user',
      parts: [
        {
          fileData: {
            fileUri: 'https://www.youtube.com/watch?v=9hE5-98ZeCg',
            mimeType: 'video/*',
          },
          // Clip the video to the [startOffset, endOffset] interval.
          videoMetadata: {
            startOffset: '40s',
            endOffset: '80s',
          },
        },
        {
          text: 'Please summarize the video in 3 sentences.',
        },
      ],
    },
  ];

  const response = await ai.models.generateContent({
    model,
    contents,
  });
  console.log(response.text);
}

await main();
You can set custom frame rate sampling by passing an fps argument to
videoMetadata.
from google import genai
from google.genai import types

# Only for videos of size <20Mb
video_file_name = "/path/to/your/video.mp4"
with open(video_file_name, 'rb') as f:
    video_bytes = f.read()

client = genai.Client()

# Sample 5 frames per second instead of the default 1 FPS.
response = client.models.generate_content(
    model='models/gemini-2.5-flash',
    contents=types.Content(
        parts=[
            types.Part(
                inline_data=types.Blob(
                    data=video_bytes,
                    mime_type='video/mp4'),
                video_metadata=types.VideoMetadata(fps=5)
            ),
            types.Part(text='Please summarize the video in 3 sentences.')
        ]
    )
)
By default 1 frame per second (FPS) is sampled from the video. You might want to set low FPS (< 1) for long videos. This is especially useful for mostly static videos (e.g. lectures). Use a higher FPS for videos requiring granular temporal analysis, such as fast-action understanding or high-speed motion tracking.
Gemini supports the following video format MIME types:
video/mp4, video/mpeg, video/mov, video/avi, video/x-flv, video/mpg, video/webm, video/wmv, video/3gpp.
When mediaResolution is set to low, frames are tokenized at 66 tokens per frame.
Media resolution: Gemini 3 introduces granular control over multimodal
vision processing with the media_resolution parameter. The
media_resolution parameter determines the
maximum number of tokens allocated per input image or video frame.
Higher resolutions improve the model's ability to read fine text or identify
small details, but increase token usage and latency.
For more details about the parameter and how it can impact token calculations, see the media resolution guide.
Timestamp format: When referring to specific moments in a video within your prompt, use the MM:SS format (e.g., 01:15 for 1 minute and 15 seconds).
Best practices:
contents array.
This guide shows how to upload video files and generate text outputs from video inputs. To learn more, see the following resources: