From d05e18b59d9ff1a8a436fcf7de1c66322f06ea98 Mon Sep 17 00:00:00 2001
From: Hamza-Ayed
Date: Fri, 5 Jun 2026 17:13:07 +0300
Subject: [PATCH] Deploy on 2026-06-05 17:13:07

---
Review notes (this section is not part of the commit message):
 * releaseLock() no longer unlinks collector.lock. Removing the file after
   unlocking lets one process lock the unlinked file while another creates
   and locks a new one, so two collector runs could overlap.
 * collectSource() gives $totalNewProcessed a default of 0, so existing
   one-argument calls keep working.
 * acquireLock() stores the handle only once the lock is actually held.
 * The lock failure is thrown as \RuntimeException (same message).
 * Hunk offsets and the diffstat were recomputed for these edits; the blob
   ids on the Collector.php index line were not.

 app/Services/Crawler/AiAnalyzer.php |  11 +++
 app/Services/Crawler/Collector.php  | 163 ++++++++++++++++++++++------
 database/seeds/DatabaseSeeder.php   |   2 +
 3 files changed, 139 insertions(+), 37 deletions(-)

diff --git a/app/Services/Crawler/AiAnalyzer.php b/app/Services/Crawler/AiAnalyzer.php
index acc4492..405d378 100644
--- a/app/Services/Crawler/AiAnalyzer.php
+++ b/app/Services/Crawler/AiAnalyzer.php
@@ -8,6 +8,7 @@ class AiAnalyzer
 {
     private ?string $apiKey;
     private string $model;
+    private static ?float $lastCallTime = null;

     public function __construct()
     {
@@ -108,6 +109,16 @@ PROMPT;
      */
     private function callGemini(string $prompt): string
     {
+        if (self::$lastCallTime !== null) {
+            $elapsed = microtime(true) - self::$lastCallTime;
+            $minInterval = 4.5; // Space out requests to under 15 RPM
+            if ($elapsed < $minInterval) {
+                $sleepTime = $minInterval - $elapsed;
+                usleep((int)($sleepTime * 1000000));
+            }
+        }
+        self::$lastCallTime = microtime(true);
+
         $url = "https://generativelanguage.googleapis.com/v1beta/models/{$this->model}:generateContent?key={$this->apiKey}";

         $payload = json_encode([
diff --git a/app/Services/Crawler/Collector.php b/app/Services/Crawler/Collector.php
index c2585f3..4374aab 100644
--- a/app/Services/Crawler/Collector.php
+++ b/app/Services/Crawler/Collector.php
@@ -16,6 +16,11 @@ class Collector
     private ActivityLogger $logger;
     private TelegramNotifier $notifier;

+    /** @var resource|null Open handle on the run-lock file while this instance holds the lock. */
+    private $lockHandle = null;
+    private int $maxNewPerFeed = 5;
+    private int $maxNewTotal = 100;
+
     public function __construct(
         Connection $connection,
         RssParser $rssParser,
@@ -35,6 +40,10 @@ class Collector
      */
     public function collectAll(): array
     {
+        if (!$this->acquireLock()) {
+            throw new \RuntimeException("Another collector run is currently in progress.");
+        }
+
         $results = [
             'total_sources' => 0,
             'processed' => 0,
@@ -44,46 +53,66 @@ class Collector
             'details' => [],
         ];

-        $sources = $this->getActiveSources();
+        try {
+            $this->maxNewPerFeed = (int)($this->getSetting('crawler_max_new_per_feed') ?: 5);
+            $this->maxNewTotal = (int)($this->getSetting('crawler_max_new_total') ?: 100);

-        foreach ($sources as $source) {
-            $results['total_sources']++;
-            try {
-                $result = $this->collectSource($source);
-                $results['processed']++;
-                $results['new_opportunities'] += $result['opportunities'];
-                $results['new_organizations'] += $result['organizations'];
-                $results['details'][] = [
-                    'source' => $source['name'],
-                    'type' => $source['type'],
-                    'status' => 'success',
-                    'entries_found' => $result['entries_found'],
-                    'new_opportunities' => $result['opportunities'],
-                    'new_organizations' => $result['organizations'],
-                ];
-            } catch (Throwable $e) {
-                $results['errors']++;
-                $results['details'][] = [
-                    'source' => $source['name'],
-                    'type' => $source['type'],
-                    'status' => 'error',
-                    'error' => $e->getMessage(),
-                ];
+            $sources = $this->getActiveSources();
+            $totalNewProcessed = 0;
+
+            foreach ($sources as $source) {
+                $results['total_sources']++;
+
+                if ($totalNewProcessed >= $this->maxNewTotal) {
+                    $results['details'][] = [
+                        'source' => $source['name'],
+                        'type' => $source['type'],
+                        'status' => 'skipped',
+                        'reason' => 'Global limit reached',
+                    ];
+                    continue;
+                }
+
+                try {
+                    $result = $this->collectSource($source, $totalNewProcessed);
+                    $results['processed']++;
+                    $results['new_opportunities'] += $result['opportunities'];
+                    $results['new_organizations'] += $result['organizations'];
+                    $results['details'][] = [
+                        'source' => $source['name'],
+                        'type' => $source['type'],
+                        'status' => 'success',
+                        'entries_found' => $result['entries_found'],
+                        'new_opportunities' => $result['opportunities'],
+                        'new_organizations' => $result['organizations'],
+                    ];
+                } catch (Throwable $e) {
+                    $results['errors']++;
+                    $results['details'][] = [
+                        'source' => $source['name'],
+                        'type' => $source['type'],
+                        'status' => 'error',
+                        'error' => $e->getMessage(),
+                    ];
+                }
             }
-        }

-        $this->logger->log(null, 'collector_run', 'Collector completed: ' . json_encode([
-            'total_sources' => $results['total_sources'],
-            'processed' => $results['processed'],
-            'errors' => $results['errors'],
-            'new_opportunities' => $results['new_opportunities'],
-            'new_organizations' => $results['new_organizations'],
-        ]));
+            $this->logger->log(null, 'collector_run', 'Collector completed: ' . json_encode([
+                'total_sources' => $results['total_sources'],
+                'processed' => $results['processed'],
+                'errors' => $results['errors'],
+                'new_opportunities' => $results['new_opportunities'],
+                'new_organizations' => $results['new_organizations'],
+            ]));

-        // Send Telegram notification if enabled
-        if ($this->getSetting('telegram_enabled') === '1') {
-            $this->notifier->loadSettings();
-            $this->notifier->notifyCollectorResults($results);
+            // Send Telegram notification if enabled
+            if ($this->getSetting('telegram_enabled') === '1') {
+                $this->notifier->loadSettings();
+                $this->notifier->notifyCollectorResults($results);
+            }
+
+        } finally {
+            $this->releaseLock();
         }

         return $results;
@@ -92,7 +121,12 @@ class Collector
     /**
      * Collect from a single source.
+     *
+     * @param array $source            Source row (the code reads its 'url' key).
+     * @param int   $totalNewProcessed Run-wide count of new entries processed so far; incremented
+     *                                 in place. Defaults to 0 so one-argument calls keep working.
+     * @return array Counters for this source: 'entries_found', 'opportunities', 'organizations'.
      */
-    public function collectSource(array $source): array
+    public function collectSource(array $source, int &$totalNewProcessed = 0): array
     {
         $result = [
             'entries_found' => 0,
@@ -104,14 +138,69 @@ class Collector
             $entries = $this->rssParser->fetchEntries($source['url']);
             $result['entries_found'] = count($entries);

+            $newEntriesInSource = 0;
             foreach ($entries as $entry) {
+                if ($totalNewProcessed >= $this->maxNewTotal) {
+                    break;
+                }
+
+                if ($this->rssParser->entryExists($entry['url'])) {
+                    continue;
+                }
+
+                if ($newEntriesInSource >= $this->maxNewPerFeed) {
+                    break; // stop processing entries since they are sorted newest-first
+                }
+
                 $this->processEntry($entry, $source, $result);
+                $newEntriesInSource++;
+                $totalNewProcessed++;
             }
         }

         return $result;
     }

+    /**
+     * Acquire the run lock.
+     *
+     * Opens collector.lock three directories above this file and takes a
+     * non-blocking exclusive flock() on it.
+     *
+     * @return bool True when the lock is held; false when the file cannot be
+     *              opened or the lock is already held elsewhere.
+     */
+    private function acquireLock(): bool
+    {
+        $lockFile = __DIR__ . '/../../../collector.lock';
+        $handle = @fopen($lockFile, 'c');
+        if ($handle === false) {
+            return false;
+        }
+        if (!flock($handle, LOCK_EX | LOCK_NB)) {
+            fclose($handle);
+            return false;
+        }
+        $this->lockHandle = $handle;
+        return true;
+    }
+
+    /**
+     * Release the run lock, if held.
+     *
+     * The lock file is left in place on purpose: unlinking it after unlocking
+     * would let one process lock the old (now unlinked) file while another
+     * creates and locks a fresh one, so two runs could overlap.
+     */
+    private function releaseLock(): void
+    {
+        if ($this->lockHandle) {
+            flock($this->lockHandle, LOCK_UN);
+            fclose($this->lockHandle);
+            $this->lockHandle = null;
+        }
+    }
+
     /**
      * Process a single entry: analyze, save opportunity, save organization.
      */
diff --git a/database/seeds/DatabaseSeeder.php b/database/seeds/DatabaseSeeder.php
index 33b5e2b..e47d9d7 100644
--- a/database/seeds/DatabaseSeeder.php
+++ b/database/seeds/DatabaseSeeder.php
@@ -89,6 +89,8 @@ class DatabaseSeeder
             ['key' => 'system_email', 'value' => 'info@scoutiq.intaleqapp.com', 'description' => 'Primary sender email address.'],
             ['key' => 'crawler_enabled', 'value' => 'true', 'description' => 'Global toggle for data collection crawlers.'],
             ['key' => 'crawler_interval_hours', 'value' => '24', 'description' => 'Delay between crawler runs.'],
+            ['key' => 'crawler_max_new_per_feed', 'value' => '5', 'description' => 'Maximum new opportunities to process per feed in a single run.'],
+            ['key' => 'crawler_max_new_total', 'value' => '100', 'description' => 'Maximum total new opportunities to process in a single collector run.'],
         ];

         foreach ($settings as $setting) {