From 2e7cd11f855a8972d91a540e47e3f79089612caa Mon Sep 17 00:00:00 2001 From: Hamza-Ayed Date: Fri, 22 May 2026 15:10:13 +0300 Subject: [PATCH] Implement Gemini audio voice note replies --- .../app/Controllers/WhatsAppController.php | 30 ++-- backend/app/Services/GeminiService.php | 136 ++++++++++++++++++ backend/public/test_audio.php | 88 ++++++++++++ whatsapp-gateway/baileys-client.js | 11 +- whatsapp-gateway/server.js | 12 +- 5 files changed, 264 insertions(+), 13 deletions(-) create mode 100644 backend/public/test_audio.php diff --git a/backend/app/Controllers/WhatsAppController.php b/backend/app/Controllers/WhatsAppController.php index c84d9d8..05e80ea 100644 --- a/backend/app/Controllers/WhatsAppController.php +++ b/backend/app/Controllers/WhatsAppController.php @@ -318,6 +318,7 @@ class WhatsAppController extends BaseController } $replyText = null; + $replyAudio = null; if ($rule['trigger_type'] === 'keyword') { if (empty($incomingText)) { @@ -372,7 +373,15 @@ class WhatsAppController extends BaseController if (strpos($mimeType, ';') !== false) { $mimeType = trim(explode(';', $mimeType)[0]); } - $replyText = \App\Services\GeminiService::generateResponseFromAudio($apiKey, $systemPrompt, $msgData['audio'], $mimeType); + // Try generating native audio response first + $audioResponse = \App\Services\GeminiService::generateAudioResponseFromAudio($apiKey, $systemPrompt, $msgData['audio'], $mimeType); + if ($audioResponse && !empty($audioResponse['audio'])) { + $replyAudio = $audioResponse['audio']; + $replyText = '[صوت من الذكاء الاصطناعي]'; + } else { + // Fallback to text output from audio + $replyText = \App\Services\GeminiService::generateResponseFromAudio($apiKey, $systemPrompt, $msgData['audio'], $mimeType); + } } elseif ($hasImage) { $mimeType = $msgData['imageMimeType']; if (strpos($mimeType, ';') !== false) { @@ -388,9 +397,9 @@ class WhatsAppController extends BaseController } } - if (!empty($replyText)) { + if (!empty($replyText) || !empty($replyAudio)) { // Check if the reply contains [PAYMENT_RECEIPT: { ... }] tag from Gemini - if (preg_match('/\[PAYMENT_RECEIPT:\s*(\{.*?\})\]/s', $replyText, $matches)) { + if (!empty($replyText) && preg_match('/\[PAYMENT_RECEIPT:\s*(\{.*?\})\]/s', $replyText, $matches)) { $jsonStr = $matches[1]; // Strip the tag from the final reply sent to user $replyText = trim(str_replace($matches[0], '', $replyText)); @@ -410,11 +419,16 @@ class WhatsAppController extends BaseController $sendUrl = $gatewayUrl . '/api/messages/send'; } - $payload = json_encode([ + $payloadData = [ 'session_key' => $session['session_key'], - 'phone' => $msgData['phone'], - 'message' => $replyText - ]); + 'phone' => $msgData['phone'] + ]; + if (!empty($replyAudio)) { + $payloadData['audio'] = $replyAudio; + } else { + $payloadData['message'] = $replyText; + } + $payload = json_encode($payloadData); $ch = curl_init($sendUrl); curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); @@ -450,7 +464,7 @@ class WhatsAppController extends BaseController 'session_id' => $session['id'], 'contact_phone' => $msgData['phone'], 'direction' => 'outbound', - 'message_type' => 'text', + 'message_type' => !empty($replyAudio) ? 'audio' : 'text', 'message_body' => $replyText, 'whatsapp_message_id' => $waMsgId, 'status' => $status, diff --git a/backend/app/Services/GeminiService.php b/backend/app/Services/GeminiService.php index 684d0f5..91e1d0e 100644 --- a/backend/app/Services/GeminiService.php +++ b/backend/app/Services/GeminiService.php @@ -212,4 +212,140 @@ class GeminiService $data = json_decode($response, true); return $data['candidates'][0]['content']['parts'][0]['text'] ?? null; } + + /** + * Call Gemini API to generate a native audio (speech) response from text + */ + public static function generateAudioResponse(string $apiKey, string $systemPrompt, string $userMessage, string $voiceName = 'Puck'): ?array + { + $url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-flash-lite-latest:generateContent?key=' . $apiKey; + + $payload = json_encode([ + 'contents' => [ + [ + 'role' => 'user', + 'parts' => [ + ['text' => $userMessage] + ] + ] + ], + 'systemInstruction' => [ + 'parts' => [ + ['text' => $systemPrompt] + ] + ], + 'generationConfig' => [ + 'responseModalities' => ['AUDIO'], + 'speechConfig' => [ + 'voiceConfig' => [ + 'prebuiltVoiceConfig' => [ + 'voiceName' => $voiceName + ] + ] + ] + ] + ]); + + $ch = curl_init($url); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_POST, true); + curl_setopt($ch, CURLOPT_POSTFIELDS, $payload); + curl_setopt($ch, CURLOPT_HTTPHEADER, [ + 'Content-Type: application/json' + ]); + curl_setopt($ch, CURLOPT_TIMEOUT, 30); + + $response = curl_exec($ch); + $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); + curl_close($ch); + + if ($httpCode !== 200) { + error_log("[Gemini Audio API Error] HTTP " . $httpCode . " | Response: " . $response); + return null; + } + + $data = json_decode($response, true); + $part = $data['candidates'][0]['content']['parts'][0] ?? null; + if ($part && isset($part['inlineData'])) { + return [ + 'audio' => $part['inlineData']['data'], + 'mimeType' => $part['inlineData']['mimeType'] ?? 'audio/mp4' + ]; + } + return null; + } + + /** + * Call Gemini API with audio inline data to generate a native audio response + */ + public static function generateAudioResponseFromAudio(string $apiKey, string $systemPrompt, string $audioBase64, string $mimeType, string $voiceName = 'Puck'): ?array + { + $url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-flash-lite-latest:generateContent?key=' . $apiKey; + + if (strpos($mimeType, ';') !== false) { + $mimeType = trim(explode(';', $mimeType)[0]); + } + + $payload = json_encode([ + 'contents' => [ + [ + 'role' => 'user', + 'parts' => [ + [ + 'inlineData' => [ + 'mimeType' => $mimeType, + 'data' => $audioBase64 + ] + ], + [ + 'text' => "استمع إلى التسجيل الصوتي المرفق وأجب عليه مباشرة بصوتك بناءً على الإرشادات المحددة." + ] + ] + ] + ], + 'systemInstruction' => [ + 'parts' => [ + ['text' => $systemPrompt] + ] + ], + 'generationConfig' => [ + 'responseModalities' => ['AUDIO'], + 'speechConfig' => [ + 'voiceConfig' => [ + 'prebuiltVoiceConfig' => [ + 'voiceName' => $voiceName + ] + ] + ] + ] + ]); + + $ch = curl_init($url); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_POST, true); + curl_setopt($ch, CURLOPT_POSTFIELDS, $payload); + curl_setopt($ch, CURLOPT_HTTPHEADER, [ + 'Content-Type: application/json' + ]); + curl_setopt($ch, CURLOPT_TIMEOUT, 45); // 45 seconds timeout for audio-to-audio generation + + $response = curl_exec($ch); + $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); + curl_close($ch); + + if ($httpCode !== 200) { + error_log("[Gemini Audio-to-Audio Response Error] HTTP " . $httpCode . " | Response: " . $response); + return null; + } + + $data = json_decode($response, true); + $part = $data['candidates'][0]['content']['parts'][0] ?? null; + if ($part && isset($part['inlineData'])) { + return [ + 'audio' => $part['inlineData']['data'], + 'mimeType' => $part['inlineData']['mimeType'] ?? 'audio/mp4' + ]; + } + return null; + } } diff --git a/backend/public/test_audio.php b/backend/public/test_audio.php new file mode 100644 index 0000000..f6f5b89 --- /dev/null +++ b/backend/public/test_audio.php @@ -0,0 +1,88 @@ + { // Send outbound message app.post('/api/messages/send', async (req, res) => { - const { session_key, phone, message, media_url } = req.body; + const { session_key, phone, message, media_url, audio } = req.body; - if (!session_key || !phone || !message) { - return res.status(400).json({ error: 'Missing session_key, phone, or message' }); + if (!session_key || !phone) { + return res.status(400).json({ error: 'Missing session_key or phone' }); + } + + if (!message && !audio) { + return res.status(400).json({ error: 'Missing message or audio' }); } try { - const result = await sendMessage(session_key, phone, message, media_url); + const result = await sendMessage(session_key, phone, message, media_url, audio); res.json({ status: 'success', data: result }); } catch (err) { console.error(`Error sending message via ${session_key} to ${phone}:`, err);