Implement Gemini audio voice note replies

2026-05-22 15:10:13 +03:00
parent 395c8ee8eb
commit 2e7cd11f85
5 changed files with 264 additions and 13 deletions
--- a/backend/app/Controllers/WhatsAppController.php
+++ b/backend/app/Controllers/WhatsAppController.php
@@ -318,6 +318,7 @@ class WhatsAppController extends BaseController
            }

            $replyText = null;
+            $replyAudio = null;

            if ($rule['trigger_type'] === 'keyword') {
                if (empty($incomingText)) {
@@ -372,7 +373,15 @@ class WhatsAppController extends BaseController
                    if (strpos($mimeType, ';') !== false) {
                        $mimeType = trim(explode(';', $mimeType)[0]);
                    }
-                    $replyText = \App\Services\GeminiService::generateResponseFromAudio($apiKey, $systemPrompt, $msgData['audio'], $mimeType);
+                    // Try generating native audio response first
+                    $audioResponse = \App\Services\GeminiService::generateAudioResponseFromAudio($apiKey, $systemPrompt, $msgData['audio'], $mimeType);
+                    if ($audioResponse && !empty($audioResponse['audio'])) {
+                        $replyAudio = $audioResponse['audio'];
+                        $replyText = '[صوت من الذكاء الاصطناعي]';
+                    } else {
+                        // Fallback to text output from audio
+                        $replyText = \App\Services\GeminiService::generateResponseFromAudio($apiKey, $systemPrompt, $msgData['audio'], $mimeType);
+                    }
                } elseif ($hasImage) {
                    $mimeType = $msgData['imageMimeType'];
                    if (strpos($mimeType, ';') !== false) {
@@ -388,9 +397,9 @@ class WhatsAppController extends BaseController
                }
            }

-            if (!empty($replyText)) {
+            if (!empty($replyText) || !empty($replyAudio)) {
                // Check if the reply contains [PAYMENT_RECEIPT: { ... }] tag from Gemini
-                if (preg_match('/\[PAYMENT_RECEIPT:\s*(\{.*?\})\]/s', $replyText, $matches)) {
+                if (!empty($replyText) && preg_match('/\[PAYMENT_RECEIPT:\s*(\{.*?\})\]/s', $replyText, $matches)) {
                    $jsonStr = $matches[1];
                    // Strip the tag from the final reply sent to user
                    $replyText = trim(str_replace($matches[0], '', $replyText));
@@ -410,11 +419,16 @@ class WhatsAppController extends BaseController
                    $sendUrl = $gatewayUrl . '/api/messages/send';
                }

-                $payload = json_encode([
+                $payloadData = [
                    'session_key' => $session['session_key'],
-                    'phone' => $msgData['phone'],
-                    'message' => $replyText
-                ]);
+                    'phone' => $msgData['phone']
+                ];
+                if (!empty($replyAudio)) {
+                    $payloadData['audio'] = $replyAudio;
+                } else {
+                    $payloadData['message'] = $replyText;
+                }
+                $payload = json_encode($payloadData);

                $ch = curl_init($sendUrl);
                curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
@@ -450,7 +464,7 @@ class WhatsAppController extends BaseController
                    'session_id' => $session['id'],
                    'contact_phone' => $msgData['phone'],
                    'direction' => 'outbound',
-                    'message_type' => 'text',
+                    'message_type' => !empty($replyAudio) ? 'audio' : 'text',
                    'message_body' => $replyText,
                    'whatsapp_message_id' => $waMsgId,
                    'status' => $status,
--- a/backend/app/Services/GeminiService.php
+++ b/backend/app/Services/GeminiService.php
@@ -212,4 +212,140 @@ class GeminiService
        $data = json_decode($response, true);
        return $data['candidates'][0]['content']['parts'][0]['text'] ?? null;
    }
+
+    /**
+     * Call the Gemini generateContent endpoint with a text message and request a
+     * spoken (AUDIO modality) reply.
+     *
+     * @param string $apiKey       Gemini API key; appended to the request URL as the `key` query parameter.
+     * @param string $systemPrompt Text sent as the system instruction.
+     * @param string $userMessage  Text sent as the single user turn.
+     * @param string $voiceName    Prebuilt voice name placed in speechConfig (default 'Puck').
+     *
+     * @return array{audio: string, mimeType: string}|null The inline audio payload exactly as
+     *         returned by the API plus its MIME type, or null when the transfer fails, the
+     *         HTTP status is not 200, or the first candidate carries no inline audio part.
+     */
+    public static function generateAudioResponse(string $apiKey, string $systemPrompt, string $userMessage, string $voiceName = 'Puck'): ?array
+    {
+        $url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-flash-lite-latest:generateContent?key=' . $apiKey;
+
+        $payload = json_encode([
+            'contents' => [
+                [
+                    'role' => 'user',
+                    'parts' => [
+                        ['text' => $userMessage]
+                    ]
+                ]
+            ],
+            'systemInstruction' => [
+                'parts' => [
+                    ['text' => $systemPrompt]
+                ]
+            ],
+            'generationConfig' => [
+                'responseModalities' => ['AUDIO'],
+                'speechConfig' => [
+                    'voiceConfig' => [
+                        'prebuiltVoiceConfig' => [
+                            'voiceName' => $voiceName
+                        ]
+                    ]
+                ]
+            ]
+        ]);
+
+        $ch = curl_init($url);
+        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
+        curl_setopt($ch, CURLOPT_POST, true);
+        curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
+        curl_setopt($ch, CURLOPT_HTTPHEADER, [
+            'Content-Type: application/json'
+        ]);
+        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
+
+        $response = curl_exec($ch);
+        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
+        // Must be read before curl_close(); it is the only diagnostic when no HTTP response arrived.
+        $curlError = curl_error($ch);
+        curl_close($ch);
+
+        // curl_exec() returns false on transport-level failure (timeout, DNS, TLS, ...).
+        if ($response === false) {
+            error_log("[Gemini Audio API Error] cURL failure: " . $curlError);
+            return null;
+        }
+
+        if ($httpCode !== 200) {
+            error_log("[Gemini Audio API Error] HTTP " . $httpCode . " | Response: " . $response);
+            return null;
+        }
+
+        $data = json_decode($response, true);
+        $parts = $data['candidates'][0]['content']['parts'] ?? null;
+        if (!is_array($parts)) {
+            return null;
+        }
+
+        // The audio is not guaranteed to be the first part, so take the first part
+        // that actually carries a non-empty inline payload.
+        foreach ($parts as $part) {
+            $audio = $part['inlineData']['data'] ?? null;
+            if (is_string($audio) && $audio !== '') {
+                return [
+                    'audio' => $audio,
+                    // NOTE(review): 'audio/mp4' fallback kept from the original; confirm it
+                    // matches the format Gemini really returns when mimeType is absent.
+                    'mimeType' => $part['inlineData']['mimeType'] ?? 'audio/mp4'
+                ];
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Call the Gemini generateContent endpoint with an inline audio recording and
+     * request a spoken (AUDIO modality) reply.
+     *
+     * @param string $apiKey       Gemini API key; appended to the request URL as the `key` query parameter.
+     * @param string $systemPrompt Text sent as the system instruction.
+     * @param string $audioBase64  Audio payload forwarded unchanged as inlineData.data
+     *                             (presumably base64-encoded, per the parameter name).
+     * @param string $mimeType     MIME type of the recording; anything after a ';' is stripped.
+     * @param string $voiceName    Prebuilt voice name placed in speechConfig (default 'Puck').
+     *
+     * @return array{audio: string, mimeType: string}|null The inline audio payload exactly as
+     *         returned by the API plus its MIME type, or null when the transfer fails, the
+     *         HTTP status is not 200, or the first candidate carries no inline audio part.
+     */
+    public static function generateAudioResponseFromAudio(string $apiKey, string $systemPrompt, string $audioBase64, string $mimeType, string $voiceName = 'Puck'): ?array
+    {
+        $url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-flash-lite-latest:generateContent?key=' . $apiKey;
+
+        // Drop MIME parameters (everything from the first ';') and keep only the bare type.
+        if (strpos($mimeType, ';') !== false) {
+            $mimeType = trim(explode(';', $mimeType)[0]);
+        }
+
+        $payload = json_encode([
+            'contents' => [
+                [
+                    'role' => 'user',
+                    'parts' => [
+                        [
+                            'inlineData' => [
+                                'mimeType' => $mimeType,
+                                'data' => $audioBase64
+                            ]
+                        ],
+                        [
+                            // Arabic instruction telling the model to listen to the attached
+                            // recording and answer it directly by voice, following the guidelines.
+                            'text' => "استمع إلى التسجيل الصوتي المرفق وأجب عليه مباشرة بصوتك بناءً على الإرشادات المحددة."
+                        ]
+                    ]
+                ]
+            ],
+            'systemInstruction' => [
+                'parts' => [
+                    ['text' => $systemPrompt]
+                ]
+            ],
+            'generationConfig' => [
+                'responseModalities' => ['AUDIO'],
+                'speechConfig' => [
+                    'voiceConfig' => [
+                        'prebuiltVoiceConfig' => [
+                            'voiceName' => $voiceName
+                        ]
+                    ]
+                ]
+            ]
+        ]);
+
+        $ch = curl_init($url);
+        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
+        curl_setopt($ch, CURLOPT_POST, true);
+        curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
+        curl_setopt($ch, CURLOPT_HTTPHEADER, [
+            'Content-Type: application/json'
+        ]);
+        curl_setopt($ch, CURLOPT_TIMEOUT, 45); // 45 seconds timeout for audio-to-audio generation
+
+        $response = curl_exec($ch);
+        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
+        // Must be read before curl_close(); it is the only diagnostic when no HTTP response arrived.
+        $curlError = curl_error($ch);
+        curl_close($ch);
+
+        // curl_exec() returns false on transport-level failure (timeout, DNS, TLS, ...).
+        if ($response === false) {
+            error_log("[Gemini Audio-to-Audio Response Error] cURL failure: " . $curlError);
+            return null;
+        }
+
+        if ($httpCode !== 200) {
+            error_log("[Gemini Audio-to-Audio Response Error] HTTP " . $httpCode . " | Response: " . $response);
+            return null;
+        }
+
+        $data = json_decode($response, true);
+        $parts = $data['candidates'][0]['content']['parts'] ?? null;
+        if (!is_array($parts)) {
+            return null;
+        }
+
+        // The audio is not guaranteed to be the first part, so take the first part
+        // that actually carries a non-empty inline payload.
+        foreach ($parts as $part) {
+            $audio = $part['inlineData']['data'] ?? null;
+            if (is_string($audio) && $audio !== '') {
+                return [
+                    'audio' => $audio,
+                    // NOTE(review): 'audio/mp4' fallback kept from the original; confirm it
+                    // matches the format Gemini really returns when mimeType is absent.
+                    'mimeType' => $part['inlineData']['mimeType'] ?? 'audio/mp4'
+                ];
+            }
+        }
+        return null;
+    }
 }