vercel · timconnorz · Dec 11, 2024 · Dec 12, 2024 · Dec 13, 2024 · Dec 13, 2024
diff --git a/.changeset/poor-apples-punch.md b/.changeset/poor-apples-punch.md
@@ -0,0 +1,5 @@
+---
+'@ai-sdk/google': patch
+---
+
+feat: adding audioTimestamp support to GoogleGenerativeAISettings
diff --git a/content/providers/01-ai-sdk-providers/11-google-vertex.mdx b/content/providers/01-ai-sdk-providers/11-google-vertex.mdx
@@ -294,6 +294,13 @@ The following optional settings are available for Google Vertex models:
 
   Optional. When enabled, the model will [use Google search to ground the response](https://cloud.google.com/vertex-ai/generative-ai/docs/grounding/overview).
 
+- **audioTimestamp** _boolean_
+
+  Optional. Enables timestamp understanding for audio-only files. Defaults to false.
+
+  This is useful for generating transcripts with accurate timestamps.
+  Only available for `gemini-1.5-pro-002` and `gemini-1.5-flash-002`.
+
 You can use Google Vertex language models to generate text with the `generateText` function:
 
 ```ts highlight="1,4"

diff --git a/examples/ai-core/src/e2e/google-vertex.test.ts b/examples/ai-core/src/e2e/google-vertex.test.ts
@@ -392,6 +392,25 @@ describe.each(Object.values(RUNTIME_VARIANTS))(
         expect(result.text.toLowerCase()).toContain('cat');
         expect(result.usage?.totalTokens).toBeGreaterThan(0);
       });
+
+      it('should generate text from audio input', { timeout: LONG_TEST_MILLIS }, async () => {
+        // Asks for a timestamped transcript of the sample MP3; the reply should mention "galileo".
+        const { text, usage } = await generateText({
+          model: vertex(modelId),
+          messages: [
+            {
+              role: 'user',
+              content: [
+                { type: 'text', text: 'Output a transcript of spoken words. Break up transcript lines when there are pauses. Include timestamps in the format of HH:MM:SS.SSS.' },
+                { type: 'file', data: fs.readFileSync('./data/galileo.mp3'), mimeType: 'audio/mpeg' },
+              ],
+            },
+          ],
+        });
+        expect(text).toBeTruthy();
+        expect(text.toLowerCase()).toContain('galileo');
+        expect(usage?.totalTokens).toBeGreaterThan(0);
+      });
     });
 
     describe.each(MODEL_VARIANTS.embedding)('Embedding Model: %s', modelId => {

diff --git a/examples/ai-core/src/generate-text/google-vertex-audio.ts b/examples/ai-core/src/generate-text/google-vertex-audio.ts
@@ -0,0 +1,30 @@
+import { vertex } from '@ai-sdk/google-vertex';
+import { generateText } from 'ai';
+import 'dotenv/config';
+import fs from 'node:fs';
+
+/** Sends ./data/galileo.mp3 with a transcription prompt (audioTimestamp on) and logs the reply. */
+async function main() {
+  const { text } = await generateText({
+    model: vertex('gemini-1.5-flash', { audioTimestamp: true }),
+    messages: [
+      {
+        role: 'user',
+        content: [
+          {
+            type: 'text',
+            text: 'Output a transcript of spoken words. Break up transcript lines when there are pauses. Include timestamps in the format of HH:MM:SS.SSS.',
+          },
+          {
+            type: 'file',
+            data: fs.readFileSync('./data/galileo.mp3'), // already a Buffer; no copy needed
+            mimeType: 'audio/mpeg',
+          },
+        ],
+      },
+    ],
+  });
+  console.log(text);
+}
+
+main().catch(console.error);
diff --git a/packages/google/src/google-generative-ai-language-model.ts b/packages/google/src/google-generative-ai-language-model.ts
@@ -109,6 +109,7 @@ export class GoogleGenerativeAILanguageModel implements LanguageModelV1 {
         this.supportsStructuredOutputs
           ? convertJSONSchemaToOpenAPISchema(responseFormat.schema)
           : undefined,
+      audioTimestamp: this.settings.audioTimestamp,
     };
 
     const { contents, systemInstruction } =

diff --git a/packages/google/src/google-generative-ai-settings.ts b/packages/google/src/google-generative-ai-settings.ts
@@ -55,6 +55,15 @@ Optional. A list of unique safety settings for blocking unsafe content.
       | 'BLOCK_ONLY_HIGH'
       | 'BLOCK_NONE';
   }>;
+  /**
+   * Optional. Enables timestamp understanding for audio-only files,
+   * e.g. to generate transcripts with timestamps. This is a preview feature.
+   *
+   * Available for the following models:
+   * - gemini-1.5-pro-002
+   * - gemini-1.5-flash-002
+   */
+  audioTimestamp?: boolean;
 
   /**
 Optional. When enabled, the model will use Google search to ground the response.