From 227fd7c412c8ace1dd0ca8895001e9168c3b9d97 Mon Sep 17 00:00:00 2001 From: Hamza-Ayed Date: Fri, 5 Jun 2026 17:03:37 +0300 Subject: [PATCH] Deploy on 2026-06-05 17:03:37 --- app/Services/Crawler/AiAnalyzer.php | 23 +++++++++++-- app/Services/Crawler/RssParser.php | 53 +++++++++++++++++++++-------- config/ai.php | 4 +-- 3 files changed, 62 insertions(+), 18 deletions(-) diff --git a/app/Services/Crawler/AiAnalyzer.php b/app/Services/Crawler/AiAnalyzer.php index 13aeacf..acc4492 100644 --- a/app/Services/Crawler/AiAnalyzer.php +++ b/app/Services/Crawler/AiAnalyzer.php @@ -158,6 +158,25 @@ PROMPT; $score = 30; $tags = []; $isOpportunity = true; // Treat all entries as opportunities by default + $orgName = null; + $country = null; + + // Smart Regex rules for extracting organization names from title + if (preg_match('/^([A-Z0-9][A-Za-z0-9\s\-\.\&]{2,40})\s+(raises|launches|secures|gets|partners|funded|acquires|announces|closes|receives|seeks)\b/i', $title, $matches)) { + $orgName = trim($matches[1]); + } elseif (preg_match('/(backs|invests in|funds|acquires)\s+([A-Z0-9][A-Za-z0-9\s\-\.\&]{2,40})/i', $title, $matches)) { + $orgName = trim($matches[2]); + } elseif (preg_match('/(investment in|funding for|launch of)\s+([A-Z0-9][A-Za-z0-9\s\-\.\&]{2,40})/i', $title, $matches)) { + $orgName = trim($matches[2]); + } + + if ($orgName) { + $orgName = preg_replace('/\b(series|seed|funding|round|raised|million|billion|capital|partners|ventures|inc|ltd|corp|co|llc)\b.*$/i', '', $orgName); + $orgName = trim($orgName, " \t\n\r\0\x0B,.-"); + if (strlen($orgName) < 2 || in_array(strtolower($orgName), ['startup', 'founder', 'investor', 'program', 'new', 'why', 'how', 'what', 'who', 'the'])) { + $orgName = null; + } + } // Keyword patterns if (preg_match('/\b(grant|funding|award|prize)\b/i', $text)) { @@ -238,8 +257,8 @@ PROMPT; 'tags' => $tags, 'is_opportunity' => $isOpportunity, 'summary' => substr($description, 0, 200), - 'organization_name' => null, - 'country' => null, + 'organization_name' => $orgName, + 'country' => $country, ]; } } \ No newline at end of file diff --git a/app/Services/Crawler/RssParser.php b/app/Services/Crawler/RssParser.php index 060c86d..b2fe9d1 100644 --- a/app/Services/Crawler/RssParser.php +++ b/app/Services/Crawler/RssParser.php @@ -20,25 +20,50 @@ class RssParser */ public function fetchEntries(string $url): array { - $context = stream_context_create([ - 'http' => [ - 'timeout' => 15, - 'user_agent' => 'ScoutIQ/1.0 (Crawler)', - ], - 'ssl' => [ - 'verify_peer' => false, - 'verify_peer_name' => false, - ], - ]); + $xml = false; + if (function_exists('curl_init')) { + $ch = curl_init(); + curl_setopt($ch, CURLOPT_URL, $url); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($ch, CURLOPT_USERAGENT, 'ScoutIQ/1.0 (Crawler)'); + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); + curl_setopt($ch, CURLOPT_TIMEOUT, 15); + + $xml = curl_exec($ch); + $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); + curl_close($ch); - $xml = @file_get_contents($url, false, $context); - if (!$xml) { - return []; + if ($httpCode !== 200) { + throw new \Exception("HTTP Error {$httpCode}"); + } + if ($xml === false) { + throw new \Exception("Connection failed via cURL"); + } + } else { + $context = stream_context_create([ + 'http' => [ + 'timeout' => 15, + 'user_agent' => 'ScoutIQ/1.0 (Crawler)', + ], + 'ssl' => [ + 'verify_peer' => false, + 'verify_peer_name' => false, + ], + ]); + $xml = @file_get_contents($url, false, $context); + if ($xml === false) { + $status = "Connection failed"; + if (isset($http_response_header) && isset($http_response_header[0])) { + $status = $http_response_header[0]; + } + throw new \Exception($status); + } } $feed = @simplexml_load_string($xml); if (!$feed) { - return []; + throw new \Exception("Invalid XML structure"); } $entries = []; diff --git a/config/ai.php b/config/ai.php index 6660c24..29094fe 100644 --- a/config/ai.php +++ b/config/ai.php @@ -2,11 +2,11 @@ return [ 'gemini' => [ - 'api_key' => ($_ENV['GEMINI_API_KEY'] === 'null' || !$_ENV['GEMINI_API_KEY']) ? null : $_ENV['GEMINI_API_KEY'], + 'api_key' => (getenv('GEMINI_API_KEY') ?: ($_SERVER['GEMINI_API_KEY'] ?? ($_ENV['GEMINI_API_KEY'] ?? null))), 'model' => 'gemini-flash-lite-latest', ], 'jwt' => [ - 'secret' => $_ENV['JWT_SECRET'] ?? 'base64:3uFzGf9o8+D+U0mC4/3K1y4m81Qj7G6qTzS=', + 'secret' => getenv('JWT_SECRET') ?: ($_SERVER['JWT_SECRET'] ?? ($_ENV['JWT_SECRET'] ?? 'base64:3uFzGf9o8+D+U0mC4/3K1y4m81Qj7G6qTzS=')), 'algorithm' => 'HS256', 'expires_in' => 86400 * 30, // 30 days ],