Commit 3d18df6

Merge 2ee57f4 into 264ebad (2 parents: 264ebad + 2ee57f4)
File tree: 9 files changed (+286 −22 lines)

firebase-ai/api.txt

Lines changed: 22 additions & 1 deletion

@@ -154,6 +154,9 @@ package com.google.firebase.ai.java {
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(boolean enableInterruptions);
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler);
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler, boolean enableInterruptions);
+    method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler, kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler, boolean enableInterruptions);
+    method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler);
+    method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler, boolean enableInterruptions);
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> stopAudioConversation();
     method public abstract void stopReceiving();
     field public static final com.google.firebase.ai.java.LiveSessionFutures.Companion Companion;

@@ -174,6 +177,10 @@ package com.google.firebase.ai.type {
     ctor public AudioRecordInitializationFailedException(String message);
   }

+  public final class AudioTranscriptionConfig {
+    ctor public AudioTranscriptionConfig();
+  }
+
   public final class BlockReason {
     method public String getName();
     method public int getOrdinal();

@@ -839,15 +846,19 @@ package com.google.firebase.ai.type {
     ctor public LiveGenerationConfig.Builder();
     method public com.google.firebase.ai.type.LiveGenerationConfig build();
     method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setFrequencyPenalty(Float? frequencyPenalty);
+    method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setInputAudioTranscription(com.google.firebase.ai.type.AudioTranscriptionConfig? config);
     method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setMaxOutputTokens(Integer? maxOutputTokens);
+    method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setOutputAudioTranscription(com.google.firebase.ai.type.AudioTranscriptionConfig? config);
     method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setPresencePenalty(Float? presencePenalty);
     method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setResponseModality(com.google.firebase.ai.type.ResponseModality? responseModality);
     method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setSpeechConfig(com.google.firebase.ai.type.SpeechConfig? speechConfig);
     method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setTemperature(Float? temperature);
     method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setTopK(Integer? topK);
     method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setTopP(Float? topP);
     field public Float? frequencyPenalty;
+    field public com.google.firebase.ai.type.AudioTranscriptionConfig? inputAudioTranscription;
     field public Integer? maxOutputTokens;
+    field public com.google.firebase.ai.type.AudioTranscriptionConfig? outputAudioTranscription;
     field public Float? presencePenalty;
     field public com.google.firebase.ai.type.ResponseModality? responseModality;
     field public com.google.firebase.ai.type.SpeechConfig? speechConfig;

@@ -865,14 +876,18 @@ package com.google.firebase.ai.type {
   }

   @com.google.firebase.ai.type.PublicPreviewAPI public final class LiveServerContent implements com.google.firebase.ai.type.LiveServerMessage {
-    ctor public LiveServerContent(com.google.firebase.ai.type.Content? content, boolean interrupted, boolean turnComplete, boolean generationComplete);
+    ctor @Deprecated public LiveServerContent(com.google.firebase.ai.type.Content? content, boolean interrupted, boolean turnComplete, boolean generationComplete, com.google.firebase.ai.type.Transcription? inputTranscription, com.google.firebase.ai.type.Transcription? outputTranscription);
     method public com.google.firebase.ai.type.Content? getContent();
     method public boolean getGenerationComplete();
+    method public com.google.firebase.ai.type.Transcription? getInputTranscription();
     method public boolean getInterrupted();
+    method public com.google.firebase.ai.type.Transcription? getOutputTranscription();
     method public boolean getTurnComplete();
     property public final com.google.firebase.ai.type.Content? content;
     property public final boolean generationComplete;
+    property public final com.google.firebase.ai.type.Transcription? inputTranscription;
     property public final boolean interrupted;
+    property public final com.google.firebase.ai.type.Transcription? outputTranscription;
     property public final boolean turnComplete;
   }

@@ -909,6 +924,7 @@ package com.google.firebase.ai.type {
     method public suspend Object? sendVideoRealtime(com.google.firebase.ai.type.InlineData video, kotlin.coroutines.Continuation<? super kotlin.Unit>);
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, boolean enableInterruptions = false, kotlin.coroutines.Continuation<? super kotlin.Unit>);
     method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, kotlin.coroutines.Continuation<? super kotlin.Unit>);
+    method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler = null, boolean enableInterruptions = false, kotlin.coroutines.Continuation<? super kotlin.Unit>);
     method public void stopAudioConversation();
     method public void stopReceiving();
   }

@@ -1235,6 +1251,11 @@ package com.google.firebase.ai.type {
     ctor public ToolConfig(com.google.firebase.ai.type.FunctionCallingConfig? functionCallingConfig);
   }

+  public final class Transcription {
+    method public String? getText();
+    property public final String? text;
+  }
+
   public final class UnknownException extends com.google.firebase.ai.type.FirebaseAIException {
   }
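
Taken together, the new surface is used at two points: LiveGenerationConfig opts in to transcription, and startAudioConversation consumes the results. Below is a minimal Kotlin sketch of the suspend API; the model name, the GenerativeBackend choice, and the liveModel/connect entry points are the SDK's usual ones but are assumptions here, not part of this diff.

import android.Manifest
import androidx.annotation.RequiresPermission
import com.google.firebase.Firebase
import com.google.firebase.ai.ai
import com.google.firebase.ai.type.AudioTranscriptionConfig
import com.google.firebase.ai.type.GenerativeBackend
import com.google.firebase.ai.type.LiveGenerationConfig
import com.google.firebase.ai.type.PublicPreviewAPI
import com.google.firebase.ai.type.ResponseModality

@OptIn(PublicPreviewAPI::class)
@RequiresPermission(Manifest.permission.RECORD_AUDIO)
suspend fun runTranscribedConversation() {
    // The mere presence of AudioTranscriptionConfig enables each transcription stream.
    val config = LiveGenerationConfig.Builder()
        .setResponseModality(ResponseModality.AUDIO)
        .setInputAudioTranscription(AudioTranscriptionConfig())
        .setOutputAudioTranscription(AudioTranscriptionConfig())
        .build()

    // "my-live-model" is a placeholder model name, not a real identifier.
    val model = Firebase.ai(backend = GenerativeBackend.googleAI())
        .liveModel("my-live-model", generationConfig = config)
    val session = model.connect()

    session.startAudioConversation(
        transcriptHandler = { input, output ->
            // The first Transcription is the user's input, the second is the model's output.
            input?.text?.let { println("User: $it") }
            output?.text?.let { println("Model: $it") }
        }
    )
}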
firebase-ai/src/main/kotlin/com/google/firebase/ai/LiveGenerativeModel.kt

Lines changed: 4 additions & 2 deletions

@@ -111,7 +111,9 @@ internal constructor(
         modelName,
         config?.toInternal(),
         tools?.map { it.toInternal() },
-        systemInstruction?.toInternal()
+        systemInstruction?.toInternal(),
+        config?.inputAudioTranscription?.toInternal(),
+        config?.outputAudioTranscription?.toInternal()
       )
       .toInternal()
     val data: String = Json.encodeToString(clientMessage)

@@ -135,7 +137,7 @@ internal constructor(
     } catch (e: ClosedReceiveChannelException) {
       val reason = webSession?.closeReason?.await()
       val message =
-        "Channel was closed by the server.${if(reason!=null) " Details: ${reason.message}" else "" }"
+        "Channel was closed by the server.${if (reason != null) " Details: ${reason.message}" else ""}"
       throw ServiceConnectionHandshakeFailedException(message, e)
     }
   }

firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt

Lines changed: 92 additions & 1 deletion

@@ -29,6 +29,7 @@ import com.google.firebase.ai.type.LiveSession
 import com.google.firebase.ai.type.MediaData
 import com.google.firebase.ai.type.PublicPreviewAPI
 import com.google.firebase.ai.type.SessionAlreadyReceivingException
+import com.google.firebase.ai.type.Transcription
 import io.ktor.websocket.close
 import kotlinx.coroutines.reactive.asPublisher
 import org.reactivestreams.Publisher

@@ -41,6 +42,13 @@ import org.reactivestreams.Publisher
 @PublicPreviewAPI
 public abstract class LiveSessionFutures internal constructor() {

+  /**
+   * Starts an audio conversation with the model, which can only be stopped using
+   * [stopAudioConversation].
+   */
+  @RequiresPermission(RECORD_AUDIO)
+  public abstract fun startAudioConversation(): ListenableFuture<Unit>
+
   /**
    * Starts an audio conversation with the model, which can only be stopped using
    * [stopAudioConversation] or [close].

@@ -56,9 +64,14 @@ public abstract class LiveSessionFutures internal constructor() {
   /**
    * Starts an audio conversation with the model, which can only be stopped using
    * [stopAudioConversation].
+   * @param transcriptHandler A callback function that is invoked whenever the model receives a
+   * transcript. The first [Transcription] object is the input transcription, and the second is the
+   * output transcription.
    */
   @RequiresPermission(RECORD_AUDIO)
-  public abstract fun startAudioConversation(): ListenableFuture<Unit>
+  public abstract fun startAudioConversation(
+    transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
+  ): ListenableFuture<Unit>

   /**
    * Starts an audio conversation with the model, which can only be stopped using

@@ -73,6 +86,26 @@ public abstract class LiveSessionFutures internal constructor() {
   @RequiresPermission(RECORD_AUDIO)
   public abstract fun startAudioConversation(enableInterruptions: Boolean): ListenableFuture<Unit>

+  /**
+   * Starts an audio conversation with the model, which can only be stopped using
+   * [stopAudioConversation] or [close].
+   *
+   * @param transcriptHandler A callback function that is invoked whenever the model receives a
+   * transcript. The first [Transcription] object is the input transcription, and the second is the
+   * output transcription.
+   *
+   * @param enableInterruptions If enabled, allows the user to speak over or interrupt the model's
+   * ongoing reply.
+   *
+   * **WARNING**: The user interruption feature relies on device-specific support, and may not be
+   * consistently available.
+   */
+  @RequiresPermission(RECORD_AUDIO)
+  public abstract fun startAudioConversation(
+    transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
+    enableInterruptions: Boolean
+  ): ListenableFuture<Unit>
+
   /**
    * Starts an audio conversation with the model, which can only be stopped using
    * [stopAudioConversation] or [close].

@@ -92,6 +125,30 @@ public abstract class LiveSessionFutures internal constructor() {
     enableInterruptions: Boolean
   ): ListenableFuture<Unit>

+  /**
+   * Starts an audio conversation with the model, which can only be stopped using
+   * [stopAudioConversation] or [close].
+   *
+   * @param functionCallHandler A callback function that is invoked whenever the model receives a
+   * function call.
+   *
+   * @param transcriptHandler A callback function that is invoked whenever the model receives a
+   * transcript. The first [Transcription] object is the input transcription, and the second is the
+   * output transcription.
+   *
+   * @param enableInterruptions If enabled, allows the user to speak over or interrupt the model's
+   * ongoing reply.
+   *
+   * **WARNING**: The user interruption feature relies on device-specific support, and may not be
+   * consistently available.
+   */
+  @RequiresPermission(RECORD_AUDIO)
+  public abstract fun startAudioConversation(
+    functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?,
+    transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
+    enableInterruptions: Boolean
+  ): ListenableFuture<Unit>
+
   /**
    * Stops the audio conversation with the Gemini Server.
    *

@@ -233,6 +290,14 @@ public abstract class LiveSessionFutures internal constructor() {
     functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?
   ) = SuspendToFutureAdapter.launchFuture { session.startAudioConversation(functionCallHandler) }

+  @RequiresPermission(RECORD_AUDIO)
+  override fun startAudioConversation(
+    transcriptHandler: ((Transcription?, Transcription?) -> Unit)?
+  ) =
+    SuspendToFutureAdapter.launchFuture {
+      session.startAudioConversation(transcriptHandler = transcriptHandler)
+    }
+
   @RequiresPermission(RECORD_AUDIO)
   override fun startAudioConversation() =
     SuspendToFutureAdapter.launchFuture { session.startAudioConversation() }

@@ -243,6 +308,32 @@ public abstract class LiveSessionFutures internal constructor() {
       session.startAudioConversation(enableInterruptions = enableInterruptions)
     }

+  @RequiresPermission(RECORD_AUDIO)
+  override fun startAudioConversation(
+    transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
+    enableInterruptions: Boolean
+  ) =
+    SuspendToFutureAdapter.launchFuture {
+      session.startAudioConversation(
+        transcriptHandler = transcriptHandler,
+        enableInterruptions = enableInterruptions
+      )
+    }
+
+  @RequiresPermission(RECORD_AUDIO)
+  override fun startAudioConversation(
+    functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?,
+    transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
+    enableInterruptions: Boolean
+  ) =
+    SuspendToFutureAdapter.launchFuture {
+      session.startAudioConversation(
+        functionCallHandler = functionCallHandler,
+        transcriptHandler = transcriptHandler,
+        enableInterruptions = enableInterruptions
+      )
+    }
+
   @RequiresPermission(RECORD_AUDIO)
   override fun startAudioConversation(
     functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?,
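
For Java-oriented callers, the same overloads surface through ListenableFuture. A short Kotlin sketch; LiveSessionFutures.from(...) is assumed here from the Companion listed in api.txt, not shown in this diff.

import android.Manifest
import androidx.annotation.RequiresPermission
import com.google.common.util.concurrent.ListenableFuture
import com.google.firebase.ai.java.LiveSessionFutures
import com.google.firebase.ai.type.LiveSession
import com.google.firebase.ai.type.PublicPreviewAPI

@OptIn(PublicPreviewAPI::class)
@RequiresPermission(Manifest.permission.RECORD_AUDIO)
fun startWithTranscripts(session: LiveSession): ListenableFuture<Unit> {
    val futures = LiveSessionFutures.from(session) // assumed Companion factory
    // Resolves to the (transcriptHandler, enableInterruptions) overload added above.
    return futures.startAudioConversation(
        transcriptHandler = { input, output ->
            input?.text?.let { println("User: $it") }
            output?.text?.let { println("Model: $it") }
        },
        enableInterruptions = false
    )
}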
firebase-ai/src/main/kotlin/com/google/firebase/ai/type/AudioTranscriptionConfig.kt

Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+/*
+ * Copyright 2025 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.firebase.ai.type
+
+import kotlinx.serialization.Serializable
+
+/** The audio transcription configuration. Its presence enables audio transcription. */
+public class AudioTranscriptionConfig {
+
+  @Serializable internal object Internal
+
+  internal fun toInternal() = Internal
+}
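
AudioTranscriptionConfig is deliberately empty: its Internal object carries no fields, so it serializes to an empty JSON object and the backend keys off the field's mere presence. A standalone sketch of that kotlinx.serialization behavior (EmptyConfig is an illustrative stand-in, not SDK code):

import kotlinx.serialization.Serializable
import kotlinx.serialization.encodeToString
import kotlinx.serialization.json.Json

// A field-less @Serializable object, mirroring the shape of AudioTranscriptionConfig.Internal.
@Serializable
object EmptyConfig

fun main() {
    // A @Serializable object with no properties encodes as an empty JSON object.
    println(Json.encodeToString(EmptyConfig)) // prints: {}
}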

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveClientSetupMessage.kt

Lines changed: 16 additions & 3 deletions

@@ -32,7 +32,9 @@ internal class LiveClientSetupMessage(
   // needs its own config class
   val generationConfig: LiveGenerationConfig.Internal?,
   val tools: List<Tool.Internal>?,
-  val systemInstruction: Content.Internal?
+  val systemInstruction: Content.Internal?,
+  val inputAudioTranscription: AudioTranscriptionConfig.Internal?,
+  val outputAudioTranscription: AudioTranscriptionConfig.Internal?,
 ) {
   @Serializable
   internal class Internal(val setup: LiveClientSetup) {

@@ -41,10 +43,21 @@ internal class LiveClientSetupMessage(
       val model: String,
       val generationConfig: LiveGenerationConfig.Internal?,
       val tools: List<Tool.Internal>?,
-      val systemInstruction: Content.Internal?
+      val systemInstruction: Content.Internal?,
+      val inputAudioTranscription: AudioTranscriptionConfig.Internal?,
+      val outputAudioTranscription: AudioTranscriptionConfig.Internal?,
     )
   }

   fun toInternal() =
-    Internal(Internal.LiveClientSetup(model, generationConfig, tools, systemInstruction))
+    Internal(
+      Internal.LiveClientSetup(
+        model,
+        generationConfig,
+        tools,
+        systemInstruction,
+        inputAudioTranscription,
+        outputAudioTranscription
+      )
+    )
 }
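
To visualize what the server receives after this change, here is a self-contained mirror of the message structure (field names copied from the classes above; generationConfig, tools, and systemInstruction are elided, and the exact Json configuration the SDK uses may differ):

import kotlinx.serialization.Serializable
import kotlinx.serialization.encodeToString
import kotlinx.serialization.json.Json

@Serializable object TranscriptionFlag // stand-in for AudioTranscriptionConfig.Internal

@Serializable
data class SetupSketch(
    val model: String,
    val inputAudioTranscription: TranscriptionFlag?,
    val outputAudioTranscription: TranscriptionFlag?,
)

@Serializable data class SetupMessageSketch(val setup: SetupSketch)

fun main() {
    val message = SetupMessageSketch(SetupSketch("my-model", TranscriptionFlag, TranscriptionFlag))
    // Prints:
    // {"setup":{"model":"my-model","inputAudioTranscription":{},"outputAudioTranscription":{}}}
    println(Json.encodeToString(message))
}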

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveGenerationConfig.kt

Lines changed: 25 additions & 2 deletions

@@ -53,6 +53,11 @@ import kotlinx.serialization.Serializable
  *
  * @property speechConfig Specifies the voice configuration of the audio response from the server.
  *
+ * @property inputAudioTranscription Specifies the configuration for transcribing input audio.
+ *
+ * @property outputAudioTranscription Specifies the configuration for transcribing output audio from
+ * the model.
+ *
  * Refer to the
  * [Control generated output](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/control-generated-output)
  * guide for more details.

@@ -67,7 +72,9 @@ private constructor(
   internal val presencePenalty: Float?,
   internal val frequencyPenalty: Float?,
   internal val responseModality: ResponseModality?,
-  internal val speechConfig: SpeechConfig?
+  internal val speechConfig: SpeechConfig?,
+  internal val inputAudioTranscription: AudioTranscriptionConfig?,
+  internal val outputAudioTranscription: AudioTranscriptionConfig?,
 ) {

   /**

@@ -91,6 +98,10 @@ private constructor(
    * @property responseModality See [LiveGenerationConfig.responseModality]
    *
    * @property speechConfig See [LiveGenerationConfig.speechConfig]
+   *
+   * @property inputAudioTranscription See [LiveGenerationConfig.inputAudioTranscription]
+   *
+   * @property outputAudioTranscription See [LiveGenerationConfig.outputAudioTranscription]
    */
   public class Builder {
     @JvmField public var temperature: Float? = null

@@ -101,6 +112,8 @@ private constructor(
     @JvmField public var frequencyPenalty: Float? = null
     @JvmField public var responseModality: ResponseModality? = null
     @JvmField public var speechConfig: SpeechConfig? = null
+    @JvmField public var inputAudioTranscription: AudioTranscriptionConfig? = null
+    @JvmField public var outputAudioTranscription: AudioTranscriptionConfig? = null

     public fun setTemperature(temperature: Float?): Builder = apply {
       this.temperature = temperature

@@ -123,6 +136,14 @@ private constructor(
       this.speechConfig = speechConfig
     }

+    public fun setInputAudioTranscription(config: AudioTranscriptionConfig?): Builder = apply {
+      this.inputAudioTranscription = config
+    }
+
+    public fun setOutputAudioTranscription(config: AudioTranscriptionConfig?): Builder = apply {
+      this.outputAudioTranscription = config
+    }
+
     /** Create a new [LiveGenerationConfig] with the attached arguments. */
     public fun build(): LiveGenerationConfig =
       LiveGenerationConfig(

@@ -133,7 +154,9 @@ private constructor(
         presencePenalty = presencePenalty,
         frequencyPenalty = frequencyPenalty,
         speechConfig = speechConfig,
-        responseModality = responseModality
+        responseModality = responseModality,
+        inputAudioTranscription = inputAudioTranscription,
+        outputAudioTranscription = outputAudioTranscription,
       )
   }
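
Because the Builder exposes @JvmField properties alongside the fluent setters, both styles work from Kotlin; a short sketch:

import com.google.firebase.ai.type.AudioTranscriptionConfig
import com.google.firebase.ai.type.LiveGenerationConfig
import com.google.firebase.ai.type.PublicPreviewAPI

@OptIn(PublicPreviewAPI::class)
fun transcriptionEnabledConfig(): LiveGenerationConfig {
    val builder = LiveGenerationConfig.Builder()
    builder.inputAudioTranscription = AudioTranscriptionConfig() // direct @JvmField assignment
    return builder
        .setOutputAudioTranscription(AudioTranscriptionConfig()) // fluent setter
        .build()
}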