By combining multimodal image input with structured output, Genkit can produce a detailed analysis of an image and even rough bounding boxes for the objects it contains.
This example takes an arbitrary image and analyzes it to identify objects and draw a colored bounding box around them.
Note: When Gemini "sees" an image, it normalizes it into a 1000x1000 square, so when extracting bounding boxes you'll need to rescale the results to the actual dimensions of the uploaded image (for example, multiply each x coordinate by width / 1000 and each y coordinate by height / 1000).
Example system prompt: "Identify ONLY plants in the provided image." Then try it with the bluebird picture.
// api/route.ts
import { genkit, z } from "genkit";
import { googleAI, gemini20Flash } from "@genkit-ai/googleai";
/**
 * Shared Genkit instance used by the endpoint below.
 *
 * Registers the Google AI plugin and sets Gemini 2.0 Flash as the configured
 * model; the `ai.generate()` call in this file does not name a model itself.
 * Set the GOOGLE_API_KEY environment variable before running.
 */
const ai = genkit({
  model: gemini20Flash,
  plugins: [googleAI()],
});
import { simpleEndpoint } from "@/lib/genkit-endpoint";
import type { Part } from "genkit";
import { ImageObjectSchema } from "../schema";
/** Request payload accepted by the image-analysis endpoint. */
interface Input {
  /**
   * System instructions forwarded to the model.
   *
   * Documented default: "Identify the objects in the provided image."
   * NOTE(review): the default is not applied in this file; presumably the
   * caller supplies it. Confirm against the client code.
   */
  system: Part[];

  /** The image to analyze, as a base64-encoded data URI. */
  imageUrl: string;
}
/**
 * POST handler that asks the model to list the objects found in an image.
 *
 * The request's `system` parts are passed through as the system prompt and the
 * image (a base64-encoded data URI) is sent as the only media part of the
 * prompt. The model is asked for structured output of the form
 * `{ objects: ImageObject[] }`, and that structured output is returned.
 *
 * NOTE(review): `output` may be null when the model produces nothing that
 * matches the schema; confirm how `simpleEndpoint` handles a null result.
 */
export const POST = simpleEndpoint<Input>(async (input) => {
  const { system, imageUrl } = input;

  // Shape the model must answer with: a list of detected objects.
  const responseSchema = z.object({
    objects: z
      .array(ImageObjectSchema)
      .describe("list of objects in the image"),
  });

  const response = await ai.generate({
    // Documented default: "Identify all of the objects in the provided image."
    system,
    // The image travels as a single media part (base64-encoded data URI).
    prompt: [{ media: { url: imageUrl } }],
    output: { schema: responseSchema },
  });

  return response.output;
});
// schema.ts
import { z } from "genkit";
/**
 * Zod schema for a single object detected in an image.
 *
 * The `.describe()` strings are attached to the schema so they can guide the
 * model when it fills in each field.
 */
export const ImageObjectSchema = z.object({
  // Short identifying label.
  name: z.string().describe("a short but unique name of the object"),

  // One-sentence description.
  description: z
    .string()
    .describe("a single sentence detailed description of the object"),

  // Written text on the object; may be null or omitted.
  text: z.string().describe("any written text on the object").nullish(),

  // CSS named colors, ordered from most to least prevalent.
  colors: z
    .array(z.string())
    .describe(
      "a list of one or more valid CSS named colors that make up the object, from most to least prevalent"
    ),

  // Bounding box as [y1, x1, y2, x2]; the array length is not enforced here.
  box2d: z
    .array(z.number())
    .describe("bounding box for the object in [y1,x1,y2,x2] format"),
});
/** Static TypeScript type of one detected object, inferred from `ImageObjectSchema`. */
export type ImageObject = z.infer<typeof ImageObjectSchema>;