Implement 2-step audio-to-audio process

This commit is contained in:
Hamza-Ayed
2026-05-22 15:29:30 +03:00
parent ffd8c6f2a5
commit f1d57e2763
2 changed files with 22 additions and 133 deletions

View File

@@ -218,20 +218,19 @@ class GeminiService
*/ */
public static function generateAudioResponse(string $apiKey, string $systemPrompt, string $userMessage, string $voiceName = 'Puck'): ?array public static function generateAudioResponse(string $apiKey, string $systemPrompt, string $userMessage, string $voiceName = 'Puck'): ?array
{ {
$url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=' . $apiKey; $url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent?key=' . $apiKey;
$parts = [];
if (!empty($systemPrompt)) {
$parts[] = ['text' => "System instruction: " . $systemPrompt];
}
$parts[] = ['text' => $userMessage];
$payload = json_encode([ $payload = json_encode([
'contents' => [ 'contents' => [
[ [
'role' => 'user', 'role' => 'user',
'parts' => [ 'parts' => $parts
['text' => $userMessage]
]
]
],
'systemInstruction' => [
'parts' => [
['text' => $systemPrompt]
] ]
], ],
'generationConfig' => [ 'generationConfig' => [
@@ -280,72 +279,14 @@ class GeminiService
*/ */
public static function generateAudioResponseFromAudio(string $apiKey, string $systemPrompt, string $audioBase64, string $mimeType, string $voiceName = 'Puck'): ?array public static function generateAudioResponseFromAudio(string $apiKey, string $systemPrompt, string $audioBase64, string $mimeType, string $voiceName = 'Puck'): ?array
{ {
$url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=' . $apiKey; // Step 1: Use gemini-flash-lite-latest (which supports audio input) to understand the audio message and generate a text reply
$replyText = self::generateResponseFromAudio($apiKey, $systemPrompt, $audioBase64, $mimeType);
if (strpos($mimeType, ';') !== false) { if (empty($replyText)) {
$mimeType = trim(explode(';', $mimeType)[0]); error_log("[Gemini Audio-to-Audio Error] Could not generate text response from audio.");
}
$payload = json_encode([
'contents' => [
[
'role' => 'user',
'parts' => [
[
'inlineData' => [
'mimeType' => $mimeType,
'data' => $audioBase64
]
],
[
'text' => "استمع إلى التسجيل الصوتي المرفق وأجب عليه مباشرة بصوتك بناءً على الإرشادات المحددة."
]
]
]
],
'systemInstruction' => [
'parts' => [
['text' => $systemPrompt]
]
],
'generationConfig' => [
'responseModalities' => ['AUDIO'],
'speechConfig' => [
'voiceConfig' => [
'prebuiltVoiceConfig' => [
'voiceName' => $voiceName
]
]
]
]
]);
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_POST, true);
curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
curl_setopt($ch, CURLOPT_HTTPHEADER, [
'Content-Type: application/json'
]);
curl_setopt($ch, CURLOPT_TIMEOUT, 45); // 45 seconds timeout for audio-to-audio generation
$response = curl_exec($ch);
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);
if ($httpCode !== 200) {
error_log("[Gemini Audio-to-Audio Response Error] HTTP " . $httpCode . " | Response: " . $response);
return null; return null;
} }
$data = json_decode($response, true); // Step 2: Use gemini-3.1-flash-tts-preview to convert the text response into a native audio voice note
$part = $data['candidates'][0]['content']['parts'][0] ?? null; return self::generateAudioResponse($apiKey, $systemPrompt, $replyText, $voiceName);
if ($part && isset($part['inlineData'])) {
return [
'audio' => $part['inlineData']['data'],
'mimeType' => $part['inlineData']['mimeType'] ?? 'audio/mp4'
];
}
return null;
} }
} }

View File

@@ -135,71 +135,19 @@ if ($successfulModel && $audioResponse && !empty($audioResponse['audio'])) {
echo " [Gemini] Audio MimeType: " . $audioResponse['mimeType'] . "\n"; echo " [Gemini] Audio MimeType: " . $audioResponse['mimeType'] . "\n";
echo " [Gemini] Audio Size: " . strlen($audioResponse['audio']) . " base64 chars\n"; echo " [Gemini] Audio Size: " . strlen($audioResponse['audio']) . " base64 chars\n";
// 3. Test Audio-to-Audio conversion using the successful model // 3. Test Audio-to-Audio conversion using the Service
echo "\n--- Testing Audio-to-Audio (Speech-to-Speech) ---\n"; echo "\n--- Testing Audio-to-Audio (Speech-to-Speech) via GeminiService ---\n";
$startTime = microtime(true); $startTime = microtime(true);
// We temporarily override the model inside GeminiService for this test, but since GeminiService is not updated yet, $audioResponse2 = GeminiService::generateAudioResponseFromAudio($apiKey, $systemPrompt, $audioResponse['audio'], $audioResponse['mimeType'], 'Puck');
// we'll run a direct curl call for testing audio-to-audio:
$url = 'https://generativelanguage.googleapis.com/v1beta/models/' . $successfulModel . ':generateContent?key=' . $apiKey;
$payload2 = json_encode([
'contents' => [
[
'role' => 'user',
'parts' => [
[
'inlineData' => [
'mimeType' => $audioResponse['mimeType'],
'data' => $audioResponse['audio']
]
],
[
'text' => "استمع إلى التسجيل الصوتي المرفق وأجب عليه مباشرة بصوتك بناءً على الإرشادات المحددة."
]
]
]
],
'systemInstruction' => [
'parts' => [['text' => $systemPrompt]]
],
'generationConfig' => [
'responseModalities' => ['AUDIO'],
'speechConfig' => [
'voiceConfig' => [
'prebuiltVoiceConfig' => [
'voiceName' => 'Puck'
]
]
]
]
]);
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_POST, true);
curl_setopt($ch, CURLOPT_POSTFIELDS, $payload2);
curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
curl_setopt($ch, CURLOPT_TIMEOUT, 30);
$response2 = curl_exec($ch);
$httpCode2 = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);
$elapsedTime2 = round(microtime(true) - $startTime, 2); $elapsedTime2 = round(microtime(true) - $startTime, 2);
if ($httpCode2 === 200) { if ($audioResponse2 && !empty($audioResponse2['audio'])) {
$data2 = json_decode($response2, true);
$part2 = $data2['candidates'][0]['content']['parts'][0] ?? null;
if ($part2 && isset($part2['inlineData'])) {
echo "✅ [Gemini] Successfully generated Audio-to-Audio response in {$elapsedTime2} seconds!\n"; echo "✅ [Gemini] Successfully generated Audio-to-Audio response in {$elapsedTime2} seconds!\n";
echo " [Gemini] Audio MimeType: " . ($part2['inlineData']['mimeType'] ?? 'audio/mp4') . "\n"; echo " [Gemini] Audio MimeType: " . $audioResponse2['mimeType'] . "\n";
echo " [Gemini] Audio Size: " . strlen($part2['inlineData']['data']) . " base64 chars\n"; echo " [Gemini] Audio Size: " . strlen($audioResponse2['audio']) . " base64 chars\n";
} else { } else {
echo "❌ [Gemini] Audio-to-Audio response did not contain inlineData.\n"; echo "❌ [Gemini] Audio-to-Audio generation failed.\n";
}
} else {
$errorData2 = json_decode($response2, true);
$errMsg2 = $errorData2['error']['message'] ?? 'Unknown error';
echo "❌ [Gemini] Audio-to-Audio generation failed. HTTP {$httpCode2}: {$errMsg2}\n";
} }
} else { } else {
echo "\n❌ [Gemini] All model trials for audio response generation failed.\n"; echo "\n❌ [Gemini] All model trials for audio response generation failed.\n";