Add complete ScoutIQ system: Crawler (RSS+AI), CRUD Controllers (Organizations, Contacts, Opportunities, Sources), dynamic Views, API routes, CLI collector
This commit is contained in:
245
app/Services/Crawler/AiAnalyzer.php
Normal file
245
app/Services/Crawler/AiAnalyzer.php
Normal file
@@ -0,0 +1,245 @@
|
||||
<?php
|
||||
|
||||
namespace App\Services\Crawler;
|
||||
|
||||
use Throwable;
|
||||
|
||||
/**
 * Classifies crawled content with Google Gemini, with an offline keyword
 * classifier as a fallback.
 *
 * Every public method is best-effort: it never throws because of a network or
 * model failure and always returns an array with the documented keys.
 */
class AiAnalyzer
{
    /**
     * Keyword rules for the offline classifier, evaluated in order.
     * Each rule is [pattern, type, opportunity_type, score, marks opportunity, tag].
     * A later matching rule overrides type/opportunity_type/score of an earlier one.
     */
    private const CLASSIFICATION_RULES = [
        ['/\b(grant|funding|award|prize)\b/i', 'grant', 'grant', 75, true, 'grant'],
        ['/\b(competition|contest|challenge|hackathon)\b/i', 'competition', 'competition', 65, true, 'competition'],
        ['/\b(demo day|pitch day|investor day)\b/i', 'demo_day', 'demo_day', 60, true, 'demo_day'],
        ['/\b(accelerator|incubator|venture studio)\b/i', 'investment', 'accelerator', 80, true, 'accelerator'],
        ['/\b(vc|venture capital|seed fund|series [a-z])\b/i', 'investment', 'vc_funding', 85, true, 'vc_funding'],
        ['/\b(partnership|collaboration|strategic alliance)\b/i', 'partnership', 'partnership', 50, true, 'partnership'],
        ['/\b(conference|summit|meetup|webinar|workshop)\b/i', 'event', 'event', 40, false, 'event'],
    ];

    /** Industry tag => keyword pattern used by the offline classifier. */
    private const INDUSTRY_TAGS = [
        'ai' => '/\b(ai|artificial intelligence|machine learning|deep learning)\b/i',
        'fintech' => '/\b(fintech|financial technology|blockchain|crypto)\b/i',
        'saas' => '/\b(saas|software|cloud)\b/i',
        'mobility' => '/\b(mobility|transportation|ev|electric vehicle|logistics)\b/i',
        'healthtech' => '/\b(healthtech|healthcare|biotech|medtech)\b/i',
        'cleantech' => '/\b(climate|cleantech|sustainability|green energy|renewable)\b/i',
    ];

    /** Gemini API key; null or empty disables remote calls and forces the fallback. */
    private ?string $apiKey;

    /** Gemini model identifier interpolated into the endpoint URL. */
    private string $model;

    /**
     * Read the Gemini API key and model name from config/ai.php.
     */
    public function __construct()
    {
        $config = require __DIR__ . '/../../../config/ai.php';
        $this->apiKey = $config['gemini']['api_key'] ?? null;
        $this->model = $config['gemini']['model'] ?? 'gemini-1.5-flash-latest';
    }

    /**
     * Analyze text using Google Gemini AI to classify and extract info.
     *
     * The model's answer is validated field by field; any field that is missing
     * or has an unusable type is replaced by the keyword classifier's value, so
     * callers always receive well-typed data.
     *
     * @param string $title       Entry title.
     * @param string $description Entry body (plain text).
     *
     * @return array{type:string,opportunity_type:string,score:int,tags:array,is_opportunity:bool,summary:string,organization_name:?string,country:?string}
     */
    public function analyze(string $title, string $description): array
    {
        $fallback = $this->fallbackAnalysis($title, $description);

        if (!$this->apiKey) {
            return $fallback;
        }

        $prompt = <<<PROMPT
        You are ScoutIQ, an investor intelligence AI. Analyze the following startup/investment content and return a JSON object with:
        - "type": one of ["grant", "competition", "demo_day", "event", "partnership", "investment", "news", "other"]
        - "opportunity_type": one of ["vc_funding", "accelerator", "incubator", "grant", "competition", "demo_day", "event", "partnership", "other"]
        - "score": integer 0-100 (relevance to startups seeking funding)
        - "tags": array of relevant tags (max 5)
        - "is_opportunity": boolean (true if it's a funding/investment opportunity)
        - "summary": 1-2 sentence summary of what this is
        - "organization_name": extracted organization name if any, or null
        - "country": extracted country if any, or null

        Title: {$title}
        Description: {$description}

        Respond ONLY with valid JSON, no markdown, no code fences.
        PROMPT;

        try {
            $json = $this->decodeJsonObject($this->callGemini($prompt));
            if ($json !== null && is_string($json['type'] ?? null)) {
                return $this->normalizeAnalysis($json, $fallback);
            }
        } catch (Throwable $e) {
            // Best-effort: record the failure and use the keyword classifier.
            error_log('AiAnalyzer::analyze failed: ' . $e->getMessage());
        }

        return $fallback;
    }

    /**
     * Analyze content for organization/investor extraction.
     *
     * @param string $text Free text that may mention an organization.
     *
     * @return array{name:?string,type:?string,country:?string,website:?string,description:?string}
     *         Unknown fields are null; "description" falls back to the first
     *         500 characters of $text.
     */
    public function extractOrganization(string $text): array
    {
        $defaults = [
            'name' => null,
            'type' => null,
            'country' => null,
            'website' => null,
            'description' => $this->truncate($text, 500),
        ];

        if (!$this->apiKey) {
            return $defaults;
        }

        $prompt = <<<PROMPT
        Extract organization/investor information from this text. Return JSON:
        - "name": organization name or null
        - "type": one of ["vc", "angel", "accelerator", "incubator", "venture_studio", "partner", "other"] or null
        - "country": country name or null
        - "website": website URL or null
        - "description": brief description max 200 chars

        Text: {$text}

        Respond ONLY with valid JSON.
        PROMPT;

        try {
            $json = $this->decodeJsonObject($this->callGemini($prompt));
            // An empty object is what callGemini() yields on failure: treat it as "no answer".
            if ($json !== null && $json !== []) {
                $result = array_merge($defaults, $json);
                foreach (array_keys($defaults) as $key) {
                    $result[$key] = $this->cleanString($json[$key] ?? null) ?? $defaults[$key];
                }

                return $result;
            }
        } catch (Throwable $e) {
            error_log('AiAnalyzer::extractOrganization failed: ' . $e->getMessage());
        }

        return $defaults;
    }

    /**
     * Call Gemini API.
     *
     * @param string $prompt Full prompt text.
     *
     * @return string The first candidate's text, or '{}' when the request fails
     *                or the response has no usable text.
     */
    private function callGemini(string $prompt): string
    {
        $url = "https://generativelanguage.googleapis.com/v1beta/models/{$this->model}:generateContent";

        // Substitute invalid UTF-8 instead of letting json_encode() return false.
        $payload = json_encode([
            'contents' => [
                [
                    'parts' => [
                        ['text' => $prompt],
                    ],
                ],
            ],
            'generationConfig' => [
                'temperature' => 0.2,
                'maxOutputTokens' => 500,
            ],
        ], JSON_INVALID_UTF8_SUBSTITUTE);

        if ($payload === false) {
            return '{}';
        }

        // The key travels in a header rather than the query string so it does not
        // end up in URLs captured by logs or error messages.
        $apiKey = str_replace(["\r", "\n"], '', (string) $this->apiKey);

        // No 'ssl' overrides: PHP's defaults verify the peer certificate and host
        // name, which is required when sending a secret. If verification fails
        // locally, configure openssl.cafile instead of disabling it.
        $context = stream_context_create([
            'http' => [
                'method' => 'POST',
                'header' => "Content-Type: application/json\r\n"
                    . "x-goog-api-key: {$apiKey}\r\n",
                'content' => $payload,
                'timeout' => 30,
            ],
        ]);

        $response = @file_get_contents($url, false, $context);
        if ($response === false || $response === '') {
            return '{}';
        }

        $data = json_decode($response, true);
        $text = $data['candidates'][0]['content']['parts'][0]['text'] ?? null;

        return is_string($text) ? $text : '{}';
    }

    /**
     * Decode a model reply that should contain one JSON object.
     *
     * Models often wrap JSON in markdown fences or add a short preamble despite
     * the prompt, so when the raw reply does not decode, the span between the
     * first "{" and the last "}" is tried as well.
     *
     * @return array|null Decoded array, or null when no JSON object/array is found.
     */
    private function decodeJsonObject(string $raw): ?array
    {
        $raw = trim($raw);
        $decoded = json_decode($raw, true);

        if (!is_array($decoded)) {
            $start = strpos($raw, '{');
            $end = strrpos($raw, '}');
            if ($start !== false && $end !== false && $end > $start) {
                $decoded = json_decode(substr($raw, $start, $end - $start + 1), true);
            }
        }

        return is_array($decoded) ? $decoded : null;
    }

    /**
     * Coerce a decoded model answer into the documented analysis shape.
     *
     * @param array $json     Decoded model answer.
     * @param array $defaults Keyword-classifier result used for missing/invalid fields.
     *
     * @return array Analysis with well-typed known keys; unknown keys from $json are kept.
     */
    private function normalizeAnalysis(array $json, array $defaults): array
    {
        $result = array_merge($defaults, $json);

        foreach (['type', 'opportunity_type', 'summary', 'organization_name', 'country'] as $key) {
            $result[$key] = $this->cleanString($json[$key] ?? null) ?? $defaults[$key];
        }

        // Clamp as float first so huge numeric strings cannot overflow the int cast.
        $score = $json['score'] ?? null;
        $result['score'] = is_numeric($score)
            ? (int) round(max(0.0, min(100.0, (float) $score)))
            : $defaults['score'];

        $tags = $json['tags'] ?? null;
        if (is_string($tags)) {
            $tags = explode(',', $tags);
        }
        if (is_array($tags)) {
            $clean = [];
            foreach ($tags as $tag) {
                $tag = $this->cleanString($tag);
                if ($tag !== null && !in_array($tag, $clean, true)) {
                    $clean[] = $tag;
                }
            }
            $result['tags'] = array_slice($clean, 0, 5);
        } else {
            $result['tags'] = $defaults['tags'];
        }

        // filter_var understands true/false as well as "true"/"false"/"1"/"0".
        $flag = isset($json['is_opportunity'])
            ? filter_var($json['is_opportunity'], FILTER_VALIDATE_BOOLEAN, FILTER_NULL_ON_FAILURE)
            : null;
        $result['is_opportunity'] = $flag ?? $defaults['is_opportunity'];

        return $result;
    }

    /**
     * Return a trimmed non-empty string, or null for anything else.
     *
     * @param mixed $value
     */
    private function cleanString($value): ?string
    {
        if (!is_string($value)) {
            return null;
        }

        $value = trim($value);

        return $value === '' ? null : $value;
    }

    /**
     * Cut text to at most $limit characters without splitting a UTF-8 sequence.
     *
     * A byte-wise substr() can leave half a multibyte character behind, which
     * makes later json_encode() calls on the value fail.
     */
    private function truncate(string $text, int $limit): string
    {
        if (function_exists('mb_substr')) {
            return mb_substr($text, 0, $limit, 'UTF-8');
        }

        // Without mbstring: /u counts code points and refuses invalid UTF-8.
        if (preg_match('/^.{0,' . $limit . '}/su', $text, $match) === 1) {
            return $match[0];
        }

        return substr($text, 0, $limit);
    }

    /**
     * Simple keyword-based fallback when AI is unavailable.
     *
     * Rules are applied in declaration order and the last matching rule decides
     * type, opportunity_type and score; every matching rule contributes its tag.
     *
     * @return array Same keys as analyze(); organization_name and country are null.
     */
    private function fallbackAnalysis(string $title, string $description): array
    {
        // Patterns are case-insensitive, so the text is matched as-is.
        $text = $title . ' ' . $description;

        $type = 'news';
        $opportunityType = 'other';
        $score = 10;
        $tags = [];
        $isOpportunity = false;

        foreach (self::CLASSIFICATION_RULES as [$pattern, $ruleType, $ruleOpportunityType, $ruleScore, $marksOpportunity, $tag]) {
            if (preg_match($pattern, $text) !== 1) {
                continue;
            }

            $type = $ruleType;
            $opportunityType = $ruleOpportunityType;
            $score = $ruleScore;
            $tags[] = $tag;
            if ($marksOpportunity) {
                $isOpportunity = true;
            }
        }

        foreach (self::INDUSTRY_TAGS as $tag => $pattern) {
            if (preg_match($pattern, $text) === 1) {
                $tags[] = $tag;
            }
        }

        return [
            'type' => $type,
            'opportunity_type' => $opportunityType,
            'score' => $score,
            // Re-index so the value always JSON-encodes as a list.
            'tags' => array_values(array_unique($tags)),
            'is_opportunity' => $isOpportunity,
            'summary' => $this->truncate($description, 200),
            'organization_name' => null,
            'country' => null,
        ];
    }
}
|
||||
249
app/Services/Crawler/Collector.php
Normal file
249
app/Services/Crawler/Collector.php
Normal file
@@ -0,0 +1,249 @@
|
||||
<?php
|
||||
|
||||
namespace App\Services\Crawler;
|
||||
|
||||
use App\Services\Database\Connection;
|
||||
use App\Services\Database\ActivityLogger;
|
||||
use PDO;
|
||||
use Throwable;
|
||||
|
||||
/**
 * Pulls entries from the active sources, classifies them with AiAnalyzer and
 * stores the resulting opportunities, organizations and tags.
 *
 * Persistence is best-effort per entry: a failing insert is logged with
 * error_log() and skipped so one bad entry does not abort the whole run.
 */
class Collector
{
    private PDO $pdo;
    private RssParser $rssParser;
    private AiAnalyzer $aiAnalyzer;
    private ActivityLogger $logger;

    public function __construct(
        Connection $connection,
        RssParser $rssParser,
        AiAnalyzer $aiAnalyzer,
        ActivityLogger $logger
    ) {
        $this->pdo = $connection->getPdo();
        $this->rssParser = $rssParser;
        $this->aiAnalyzer = $aiAnalyzer;
        $this->logger = $logger;
    }

    /**
     * Collect from all active sources.
     *
     * @return array Totals (total_sources, processed, errors, new_opportunities,
     *               new_organizations) plus a per-source "details" list.
     */
    public function collectAll(): array
    {
        $results = [
            'total_sources' => 0,
            'processed' => 0,
            'errors' => 0,
            'new_opportunities' => 0,
            'new_organizations' => 0,
            'details' => [],
        ];

        foreach ($this->getActiveSources() as $source) {
            $results['total_sources']++;

            try {
                $result = $this->collectSource($source);
                $results['processed']++;
                $results['new_opportunities'] += $result['opportunities'];
                $results['new_organizations'] += $result['organizations'];
                $results['details'][] = [
                    'source' => $source['name'],
                    'type' => $source['type'],
                    'status' => 'success',
                    'entries_found' => $result['entries_found'],
                    'new_opportunities' => $result['opportunities'],
                    'new_organizations' => $result['organizations'],
                ];
            } catch (Throwable $e) {
                $results['errors']++;
                $results['details'][] = [
                    'source' => $source['name'],
                    'type' => $source['type'],
                    'status' => 'error',
                    'error' => $e->getMessage(),
                ];
            }
        }

        $this->logger->log(null, 'collector_run', 'Collector completed: ' . json_encode([
            'total_sources' => $results['total_sources'],
            'processed' => $results['processed'],
            'errors' => $results['errors'],
            'new_opportunities' => $results['new_opportunities'],
            'new_organizations' => $results['new_organizations'],
        ]));

        return $results;
    }

    /**
     * Collect from a single source.
     *
     * Only sources of type "rss" are fetched; any other type yields zero counts.
     *
     * @param array $source Source row; reads "type", "url", "id" and "name".
     *
     * @return array{entries_found:int,opportunities:int,organizations:int}
     */
    public function collectSource(array $source): array
    {
        $result = [
            'entries_found' => 0,
            'opportunities' => 0,
            'organizations' => 0,
        ];

        if (($source['type'] ?? null) !== 'rss') {
            return $result;
        }

        $entries = $this->rssParser->fetchEntries($source['url']);
        $result['entries_found'] = count($entries);

        foreach ($entries as $entry) {
            $this->processEntry($entry, $source, $result);
        }

        return $result;
    }

    /**
     * Process a single entry: analyze, save opportunity, save organization.
     *
     * @param array $entry  Parsed feed entry (title, description, url, ...).
     * @param array $source Source row the entry came from.
     * @param array $result Running counters for the source; updated in place.
     */
    private function processEntry(array $entry, array $source, array &$result): void
    {
        // Skip entries whose URL is already stored as an opportunity.
        if ($this->rssParser->entryExists($entry['url'])) {
            return;
        }

        $analysis = $this->aiAnalyzer->analyze($entry['title'], $entry['description']);

        $orgId = null;
        $suggested = $analysis['organization_name'] ?? null;

        if (is_string($suggested) && trim($suggested) !== '') {
            $orgId = $this->rssParser->organizationExists($suggested);

            if (!$orgId) {
                $orgData = $this->aiAnalyzer->extractOrganization($entry['title'] . ' ' . $entry['description']);
                $name = $orgData['name'] ?? null;

                if (is_string($name) && trim($name) !== '') {
                    // The second model call may spell the name differently from the
                    // first, so look it up again before inserting a duplicate.
                    $orgId = $this->rssParser->organizationExists($name);

                    if (!$orgId) {
                        $orgId = $this->createOrganization($orgData);
                        if ($orgId) {
                            $result['organizations']++;
                        }
                    }
                }
            }
        }

        // Count the opportunity only when the row was actually written.
        if ($this->createOpportunity($entry, $analysis, $orgId ?: null, $source)) {
            $result['opportunities']++;
        }
    }

    /**
     * Create an organization record.
     *
     * @param array $data Output of AiAnalyzer::extractOrganization().
     *
     * @return int|null New row id, or null when the insert failed.
     */
    private function createOrganization(array $data): ?int
    {
        try {
            $stmt = $this->pdo->prepare(
                "INSERT INTO organizations (name, description, type, country, website_url, crm_status)
                 VALUES (?, ?, ?, ?, ?, 'New')"
            );
            $stmt->execute([
                $data['name'],
                $data['description'] ?? '',
                $data['type'] ?? 'partner',
                $data['country'] ?? null,
                $data['website'] ?? null,
            ]);

            $id = (int) $this->pdo->lastInsertId();

            return $id > 0 ? $id : null;
        } catch (Throwable $e) {
            error_log('Collector: could not create organization: ' . $e->getMessage());

            return null;
        }
    }

    /**
     * Create an opportunity record and attach its tags.
     *
     * @return bool True when the opportunity row was inserted. Tag failures are
     *              logged but do not change the result.
     */
    private function createOpportunity(array $entry, array $analysis, ?int $orgId, array $source): bool
    {
        try {
            // Model output is untrusted: accept only numeric scores and clamp to 0..100.
            $rawScore = $analysis['score'] ?? 10;
            $score = is_numeric($rawScore)
                ? (int) max(0.0, min(100.0, (float) $rawScore))
                : 10;

            // Substitute invalid UTF-8 rather than storing `false` as raw_data.
            $rawData = json_encode([
                'source_id' => $source['id'] ?? null,
                'source_name' => $source['name'] ?? '',
                'published_at' => $entry['published_at'] ?? null,
                'categories' => $entry['categories'] ?? [],
                'analysis' => $analysis,
            ], JSON_INVALID_UTF8_SUBSTITUTE | JSON_PARTIAL_OUTPUT_ON_ERROR);

            $stmt = $this->pdo->prepare(
                "INSERT INTO opportunities (title, description, type, organization_id, url, status, score, raw_data)
                 VALUES (?, ?, ?, ?, ?, 'active', ?, ?)"
            );
            $stmt->execute([
                $entry['title'],
                $analysis['summary'] ?? $entry['description'],
                $analysis['opportunity_type'] ?? $analysis['type'] ?? 'other',
                $orgId,
                $entry['url'],
                $score,
                $rawData,
            ]);

            $opportunityId = (int) $this->pdo->lastInsertId();
        } catch (Throwable $e) {
            error_log('Collector: could not create opportunity: ' . $e->getMessage());

            return false;
        }

        $tags = $analysis['tags'] ?? [];
        if ($opportunityId > 0 && is_array($tags) && $tags !== []) {
            $this->attachTags($opportunityId, $tags);
        }

        return true;
    }

    /**
     * Link tags to an opportunity, creating missing tags on the way.
     *
     * Each tag is handled on its own so one bad tag does not drop the others.
     *
     * @param int   $opportunityId Id of the stored opportunity.
     * @param array $tags          Tag names; non-string and blank values are ignored.
     */
    private function attachTags(int $opportunityId, array $tags): void
    {
        try {
            $link = $this->pdo->prepare(
                "INSERT IGNORE INTO opportunity_tags (opportunity_id, tag_id) VALUES (?, ?)"
            );
        } catch (Throwable $e) {
            error_log('Collector: could not prepare tag link: ' . $e->getMessage());

            return;
        }

        foreach ($tags as $tagName) {
            if (!is_string($tagName) || trim($tagName) === '') {
                continue;
            }

            try {
                $tagId = $this->getOrCreateTag(trim($tagName));
                if ($tagId !== null) {
                    $link->execute([$opportunityId, $tagId]);
                }
            } catch (Throwable $e) {
                error_log('Collector: could not attach tag: ' . $e->getMessage());
            }
        }
    }

    /**
     * Get or create a tag.
     *
     * @return int|null Tag id, or null when the name has no ASCII letters/digits
     *                  to build a slug from or the insert failed.
     */
    private function getOrCreateTag(string $name): ?int
    {
        $slug = $this->slugify($name);
        if ($slug === '') {
            // An empty slug would make every such tag collide on one row.
            return null;
        }

        $find = $this->pdo->prepare("SELECT id FROM tags WHERE slug = ?");
        $find->execute([$slug]);
        $id = $find->fetchColumn();
        if ($id) {
            return (int) $id;
        }

        try {
            $stmt = $this->pdo->prepare("INSERT INTO tags (name, slug) VALUES (?, ?)");
            $stmt->execute([$name, $slug]);

            return (int) $this->pdo->lastInsertId();
        } catch (Throwable $e) {
            // The insert may have lost a race with another writer; look once more.
            try {
                $find->execute([$slug]);
                $id = $find->fetchColumn();
            } catch (Throwable $retryError) {
                $id = false;
            }

            return $id ? (int) $id : null;
        }
    }

    /**
     * Build a URL-style slug: lower-case, runs of other characters become "-".
     *
     * The name is lower-cased BEFORE the character filter; filtering first
     * would turn every upper-case letter into a dash.
     */
    private function slugify(string $name): string
    {
        $slug = preg_replace('/[^a-z0-9]+/', '-', strtolower($name));

        return trim((string) $slug, '-');
    }

    /**
     * Get all active sources.
     *
     * @return array Source rows with an extra comma-separated "categories" column.
     *
     * @throws \RuntimeException When the query fails and PDO is not in exception mode.
     */
    public function getActiveSources(): array
    {
        $stmt = $this->pdo->query(
            "SELECT s.*, GROUP_CONCAT(sc.category) as categories
             FROM sources s
             LEFT JOIN source_categories sc ON sc.source_id = s.id
             WHERE s.status = 'active'
             GROUP BY s.id"
        );

        if ($stmt === false) {
            throw new \RuntimeException('Could not load active sources.');
        }

        return $stmt->fetchAll() ?: [];
    }
}
|
||||
105
app/Services/Crawler/RssParser.php
Normal file
105
app/Services/Crawler/RssParser.php
Normal file
@@ -0,0 +1,105 @@
|
||||
<?php
|
||||
|
||||
namespace App\Services\Crawler;
|
||||
|
||||
use App\Services\Database\Connection;
|
||||
use PDO;
|
||||
use Throwable;
|
||||
|
||||
/**
 * Downloads RSS 2.0 / Atom feeds and turns them into plain entry arrays, and
 * offers the duplicate look-ups the collector needs.
 */
class RssParser
{
    private PDO $pdo;

    public function __construct(Connection $connection)
    {
        $this->pdo = $connection->getPdo();
    }

    /**
     * Parse an RSS feed URL and return entries.
     *
     * @param string $url Feed URL; only http and https are fetched.
     *
     * @return array List of entries with keys title, description, url,
     *               published_at, categories and source_raw. Empty when the URL
     *               is not http(s), the download fails or the body is not XML.
     */
    public function fetchEntries(string $url): array
    {
        // Source URLs come from stored data; refuse file://, php:// and friends
        // so the crawler cannot be pointed at local files.
        $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
        if ($scheme !== 'http' && $scheme !== 'https') {
            return [];
        }

        // No 'ssl' overrides: PHP's defaults verify the certificate and host name,
        // so feed content cannot be swapped by someone on the network path.
        $context = stream_context_create([
            'http' => [
                'timeout' => 15,
                'user_agent' => 'ScoutIQ/1.0 (Crawler)',
            ],
        ]);

        $xml = @file_get_contents($url, false, $context);
        if ($xml === false || $xml === '') {
            return [];
        }

        return $this->parseFeed($xml);
    }

    /**
     * Turn a feed document into entry arrays.
     *
     * @param string $xml Raw RSS 2.0 or Atom document.
     *
     * @return array Entries in document order; items without a title are skipped.
     */
    private function parseFeed(string $xml): array
    {
        // Collect libxml errors quietly, then restore the caller's setting.
        $previous = libxml_use_internal_errors(true);
        // LIBXML_NONET: never fetch external resources while parsing.
        $feed = simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NONET);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if ($feed === false) {
            return [];
        }

        $entries = [];
        $items = $feed->channel->item ?? $feed->entry ?? [];

        foreach ($items as $item) {
            $title = trim((string) ($item->title ?? ''));
            if ($title === '') {
                continue;
            }

            $description = (string) ($item->description ?? $item->summary ?? $item->content ?? '');
            $pubDate = trim((string) ($item->pubDate ?? $item->published ?? $item->updated ?? ''));

            // An unparseable date must not become 1970-01-01; use "now" instead.
            $timestamp = $pubDate !== '' ? strtotime($pubDate) : false;
            if ($timestamp === false) {
                $timestamp = time();
            }

            $categories = [];
            if (isset($item->category)) {
                foreach ($item->category as $cat) {
                    // RSS keeps the label as text, Atom in the "term" attribute.
                    $label = trim((string) $cat);
                    if ($label === '') {
                        $label = trim((string) ($cat['term'] ?? ''));
                    }
                    if ($label !== '') {
                        $categories[] = $label;
                    }
                }
            }

            $entries[] = [
                'title' => $title,
                'description' => strip_tags($description),
                'url' => $this->extractLink($item),
                'published_at' => date('Y-m-d H:i:s', $timestamp),
                'categories' => $categories,
                'source_raw' => $xml,
            ];
        }

        return $entries;
    }

    /**
     * Find the entry's web address.
     *
     * RSS stores it as the text of <link>; Atom stores it in the "href"
     * attribute and may list several links, of which rel="alternate" (or no
     * rel at all) is the entry's own page. <guid> is the last resort.
     */
    private function extractLink(\SimpleXMLElement $item): string
    {
        $firstUsable = '';

        foreach ($item->link as $link) {
            $candidate = trim((string) $link);
            if ($candidate === '') {
                $candidate = trim((string) ($link['href'] ?? ''));
            }
            if ($candidate === '') {
                continue;
            }

            if ((string) ($link['rel'] ?? 'alternate') === 'alternate') {
                return $candidate;
            }
            if ($firstUsable === '') {
                $firstUsable = $candidate;
            }
        }

        if ($firstUsable !== '') {
            return $firstUsable;
        }

        return trim((string) ($item->guid ?? ''));
    }

    /**
     * Check if entry URL already exists in opportunities.
     */
    public function entryExists(string $url): bool
    {
        $stmt = $this->pdo->prepare("SELECT id FROM opportunities WHERE url = ? AND deleted_at IS NULL");
        $stmt->execute([$url]);

        return (bool) $stmt->fetch();
    }

    /**
     * Check if organization already exists by domain or name.
     *
     * The name look-up is a substring match; an exact name match is preferred
     * when several rows qualify.
     *
     * @param string      $name   Organization name to search for.
     * @param string|null $domain Optional domain, checked first with an exact match.
     *
     * @return int|null Matching organization id, or null when none is found
     *                  (or $name is blank and no domain matched).
     */
    public function organizationExists(string $name, ?string $domain = null): ?int
    {
        if ($domain) {
            $stmt = $this->pdo->prepare("SELECT id FROM organizations WHERE domain = ? AND deleted_at IS NULL");
            $stmt->execute([$domain]);
            $id = $stmt->fetchColumn();
            if ($id) {
                return (int) $id;
            }
        }

        // A blank name would turn the pattern into '%%' and match every row.
        $needle = trim($name);
        if ($needle === '') {
            return null;
        }

        // Escape LIKE wildcards so "%" or "_" inside a name are matched literally.
        $pattern = '%' . strtr($needle, ['!' => '!!', '%' => '!%', '_' => '!_']) . '%';

        $stmt = $this->pdo->prepare(
            "SELECT id FROM organizations
             WHERE name LIKE ? ESCAPE '!' AND deleted_at IS NULL
             ORDER BY (name = ?) DESC, id ASC
             LIMIT 1"
        );
        $stmt->execute([$pattern, $needle]);
        $id = $stmt->fetchColumn();

        return $id ? (int) $id : null;
    }
}
|
||||
Reference in New Issue
Block a user