diff --git a/backend/app/Services/GeminiService.php b/backend/app/Services/GeminiService.php index 0438219..f79df78 100644 --- a/backend/app/Services/GeminiService.php +++ b/backend/app/Services/GeminiService.php @@ -218,20 +218,19 @@ class GeminiService */ public static function generateAudioResponse(string $apiKey, string $systemPrompt, string $userMessage, string $voiceName = 'Puck'): ?array { - $url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=' . $apiKey; + $url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent?key=' . $apiKey; + + $parts = []; + if (!empty($systemPrompt)) { + $parts[] = ['text' => "System instruction: " . $systemPrompt]; + } + $parts[] = ['text' => $userMessage]; $payload = json_encode([ 'contents' => [ [ 'role' => 'user', - 'parts' => [ - ['text' => $userMessage] - ] - ] - ], - 'systemInstruction' => [ - 'parts' => [ - ['text' => $systemPrompt] + 'parts' => $parts ] ], 'generationConfig' => [ @@ -280,72 +279,14 @@ class GeminiService */ public static function generateAudioResponseFromAudio(string $apiKey, string $systemPrompt, string $audioBase64, string $mimeType, string $voiceName = 'Puck'): ?array { - $url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=' . $apiKey; - - if (strpos($mimeType, ';') !== false) { - $mimeType = trim(explode(';', $mimeType)[0]); - } - - $payload = json_encode([ - 'contents' => [ - [ - 'role' => 'user', - 'parts' => [ - [ - 'inlineData' => [ - 'mimeType' => $mimeType, - 'data' => $audioBase64 - ] - ], - [ - 'text' => "استمع إلى التسجيل الصوتي المرفق وأجب عليه مباشرة بصوتك بناءً على الإرشادات المحددة." - ] - ] - ] - ], - 'systemInstruction' => [ - 'parts' => [ - ['text' => $systemPrompt] - ] - ], - 'generationConfig' => [ - 'responseModalities' => ['AUDIO'], - 'speechConfig' => [ - 'voiceConfig' => [ - 'prebuiltVoiceConfig' => [ - 'voiceName' => $voiceName - ] - ] - ] - ] - ]); - - $ch = curl_init($url); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - curl_setopt($ch, CURLOPT_POST, true); - curl_setopt($ch, CURLOPT_POSTFIELDS, $payload); - curl_setopt($ch, CURLOPT_HTTPHEADER, [ - 'Content-Type: application/json' - ]); - curl_setopt($ch, CURLOPT_TIMEOUT, 45); // 45 seconds timeout for audio-to-audio generation - - $response = curl_exec($ch); - $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); - curl_close($ch); - - if ($httpCode !== 200) { - error_log("[Gemini Audio-to-Audio Response Error] HTTP " . $httpCode . " | Response: " . $response); + // Step 1: Use gemini-flash-lite-latest (which supports audio input) to understand the audio message and generate a text reply + $replyText = self::generateResponseFromAudio($apiKey, $systemPrompt, $audioBase64, $mimeType); + if (empty($replyText)) { + error_log("[Gemini Audio-to-Audio Error] Could not generate text response from audio."); return null; } - $data = json_decode($response, true); - $part = $data['candidates'][0]['content']['parts'][0] ?? null; - if ($part && isset($part['inlineData'])) { - return [ - 'audio' => $part['inlineData']['data'], - 'mimeType' => $part['inlineData']['mimeType'] ?? 'audio/mp4' - ]; - } - return null; + // Step 2: Use gemini-3.1-flash-tts-preview to convert the text response into a native audio voice note + return self::generateAudioResponse($apiKey, $systemPrompt, $replyText, $voiceName); } } diff --git a/backend/public/test_audio.php b/backend/public/test_audio.php index 9bc05a2..92900ea 100644 --- a/backend/public/test_audio.php +++ b/backend/public/test_audio.php @@ -135,71 +135,19 @@ if ($successfulModel && $audioResponse && !empty($audioResponse['audio'])) { echo "ℹ️ [Gemini] Audio MimeType: " . $audioResponse['mimeType'] . "\n"; echo "ℹ️ [Gemini] Audio Size: " . strlen($audioResponse['audio']) . " base64 chars\n"; - // 3. Test Audio-to-Audio conversion using the successful model - echo "\n--- Testing Audio-to-Audio (Speech-to-Speech) ---\n"; + // 3. Test Audio-to-Audio conversion using the Service + echo "\n--- Testing Audio-to-Audio (Speech-to-Speech) via GeminiService ---\n"; $startTime = microtime(true); - // We temporarily override the model inside GeminiService for this test, but since GeminiService is not updated yet, - // we'll run a direct curl call for testing audio-to-audio: - $url = 'https://generativelanguage.googleapis.com/v1beta/models/' . $successfulModel . ':generateContent?key=' . $apiKey; - $payload2 = json_encode([ - 'contents' => [ - [ - 'role' => 'user', - 'parts' => [ - [ - 'inlineData' => [ - 'mimeType' => $audioResponse['mimeType'], - 'data' => $audioResponse['audio'] - ] - ], - [ - 'text' => "استمع إلى التسجيل الصوتي المرفق وأجب عليه مباشرة بصوتك بناءً على الإرشادات المحددة." - ] - ] - ] - ], - 'systemInstruction' => [ - 'parts' => [['text' => $systemPrompt]] - ], - 'generationConfig' => [ - 'responseModalities' => ['AUDIO'], - 'speechConfig' => [ - 'voiceConfig' => [ - 'prebuiltVoiceConfig' => [ - 'voiceName' => 'Puck' - ] - ] - ] - ] - ]); - - $ch = curl_init($url); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - curl_setopt($ch, CURLOPT_POST, true); - curl_setopt($ch, CURLOPT_POSTFIELDS, $payload2); - curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']); - curl_setopt($ch, CURLOPT_TIMEOUT, 30); - $response2 = curl_exec($ch); - $httpCode2 = curl_getinfo($ch, CURLINFO_HTTP_CODE); - curl_close($ch); - + $audioResponse2 = GeminiService::generateAudioResponseFromAudio($apiKey, $systemPrompt, $audioResponse['audio'], $audioResponse['mimeType'], 'Puck'); $elapsedTime2 = round(microtime(true) - $startTime, 2); - if ($httpCode2 === 200) { - $data2 = json_decode($response2, true); - $part2 = $data2['candidates'][0]['content']['parts'][0] ?? null; - if ($part2 && isset($part2['inlineData'])) { - echo "✅ [Gemini] Successfully generated Audio-to-Audio response in {$elapsedTime2} seconds!\n"; - echo "ℹ️ [Gemini] Audio MimeType: " . ($part2['inlineData']['mimeType'] ?? 'audio/mp4') . "\n"; - echo "ℹ️ [Gemini] Audio Size: " . strlen($part2['inlineData']['data']) . " base64 chars\n"; - } else { - echo "❌ [Gemini] Audio-to-Audio response did not contain inlineData.\n"; - } + if ($audioResponse2 && !empty($audioResponse2['audio'])) { + echo "✅ [Gemini] Successfully generated Audio-to-Audio response in {$elapsedTime2} seconds!\n"; + echo "ℹ️ [Gemini] Audio MimeType: " . $audioResponse2['mimeType'] . "\n"; + echo "ℹ️ [Gemini] Audio Size: " . strlen($audioResponse2['audio']) . " base64 chars\n"; } else { - $errorData2 = json_decode($response2, true); - $errMsg2 = $errorData2['error']['message'] ?? 'Unknown error'; - echo "❌ [Gemini] Audio-to-Audio generation failed. HTTP {$httpCode2}: {$errMsg2}\n"; + echo "❌ [Gemini] Audio-to-Audio generation failed.\n"; } } else { echo "\n❌ [Gemini] All model trials for audio response generation failed.\n";