Files
scoutiq/app/Services/Crawler/RssParser.php
2026-06-05 17:03:37 +03:00

130 lines
4.1 KiB
PHP

<?php
namespace App\Services\Crawler;
use App\Services\Database\Connection;
use PDO;
use Throwable;
/**
 * Downloads RSS/Atom feeds over HTTP(S), converts their items into plain
 * arrays, and answers de-duplication queries against the `opportunities`
 * and `organizations` tables through the injected PDO handle.
 */
class RssParser
{
    private const USER_AGENT = 'ScoutIQ/1.0 (Crawler)';
    private const TIMEOUT_SECONDS = 15;
    private const MAX_REDIRECTS = 10;

    private PDO $pdo;

    /** Whether TLS certificates are verified when downloading feeds. */
    private bool $verifyTls = true;

    /**
     * @param Connection $connection Source of the PDO handle used by the lookup methods.
     * @param bool       $verifyTls  Verify the server certificate on HTTPS downloads.
     *                               Pass false only for hosts with known-broken
     *                               certificates; it allows tampering in transit.
     */
    public function __construct(Connection $connection, bool $verifyTls = true)
    {
        $this->pdo = $connection->getPdo();
        $this->verifyTls = $verifyTls;
    }

    /**
     * Download a feed and return its entries.
     *
     * Only http:// and https:// URLs are accepted so that neither transport can
     * be pointed at local files or other stream wrappers.
     *
     * @param string $url Feed URL.
     *
     * @return array<int, array{title: string, description: string, url: string, published_at: string, categories: string[], source_raw: string}>
     *
     * @throws \Exception When the URL scheme is unsupported, the download fails,
     *                    the server does not answer 200, or the body is not XML.
     */
    public function fetchEntries(string $url): array
    {
        $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
        if ($scheme !== 'http' && $scheme !== 'https') {
            throw new \Exception('Unsupported feed URL scheme');
        }

        $xml = function_exists('curl_init')
            ? $this->downloadWithCurl($url)
            : $this->downloadWithStreams($url);

        return $this->parseFeed($xml);
    }

    /**
     * Check if entry URL already exists in opportunities (soft-deleted rows are ignored).
     */
    public function entryExists(string $url): bool
    {
        $stmt = $this->pdo->prepare(
            'SELECT id FROM opportunities WHERE url = ? AND deleted_at IS NULL LIMIT 1'
        );
        $stmt->execute([$url]);

        // fetchColumn() yields false only when no row matched.
        return $stmt->fetchColumn() !== false;
    }

    /**
     * Check if organization already exists by domain or name.
     *
     * The domain, when given, is tried first as an exact match. Otherwise the
     * name is matched as a literal substring. A blank name never matches.
     *
     * @return int|null Id of the matching organization, or null when none is found.
     */
    public function organizationExists(string $name, ?string $domain = null): ?int
    {
        if ($domain !== null && $domain !== '') {
            $stmt = $this->pdo->prepare(
                'SELECT id FROM organizations WHERE domain = ? AND deleted_at IS NULL LIMIT 1'
            );
            $stmt->execute([$domain]);
            $id = $stmt->fetchColumn();
            if ($id !== false) {
                return (int) $id;
            }
        }

        $needle = trim($name);
        if ($needle === '') {
            // '%%' would match every row and return an arbitrary organization.
            return null;
        }

        // Fuzzy match by name. '%' and '_' in the name are escaped so they are
        // compared literally; '!' is the escape character because it needs no
        // quoting of its own inside the SQL literal.
        $pattern = '%' . strtr($needle, ['!' => '!!', '%' => '!%', '_' => '!_']) . '%';
        $stmt = $this->pdo->prepare(
            "SELECT id FROM organizations WHERE name LIKE ? ESCAPE '!' AND deleted_at IS NULL LIMIT 1"
        );
        $stmt->execute([$pattern]);
        $id = $stmt->fetchColumn();

        return $id !== false ? (int) $id : null;
    }

    /**
     * Download the feed body with the cURL extension.
     *
     * @throws \Exception On transport failure or a final status other than 200.
     */
    private function downloadWithCurl(string $url): string
    {
        $ch = curl_init();
        if ($ch === false) {
            throw new \Exception('Connection failed via cURL');
        }

        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => self::MAX_REDIRECTS,
            // Keep both the request and any redirect target on HTTP(S).
            CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_USERAGENT => self::USER_AGENT,
            CURLOPT_SSL_VERIFYPEER => $this->verifyTls,
            CURLOPT_TIMEOUT => self::TIMEOUT_SECONDS,
        ]);

        $body = curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $error = curl_error($ch);
        curl_close($ch);

        // Transport failures report status 0, so they must be checked before
        // the status code or they would surface as a misleading "HTTP Error 0".
        if (!is_string($body)) {
            throw new \Exception('Connection failed via cURL' . ($error !== '' ? ": {$error}" : ''));
        }
        if ($httpCode !== 200) {
            throw new \Exception("HTTP Error {$httpCode}");
        }

        return $body;
    }

    /**
     * Download the feed body with the HTTP stream wrapper (used when cURL is unavailable).
     *
     * @throws \Exception Carrying the first response status line when one was
     *                    received, otherwise "Connection failed".
     */
    private function downloadWithStreams(string $url): string
    {
        $context = stream_context_create([
            'http' => [
                'timeout' => self::TIMEOUT_SECONDS,
                'user_agent' => self::USER_AGENT,
            ],
            'ssl' => [
                'verify_peer' => $this->verifyTls,
                'verify_peer_name' => $this->verifyTls,
            ],
        ]);

        // The warning is suppressed because the failure is turned into an exception below.
        $body = @file_get_contents($url, false, $context);
        if ($body === false) {
            $status = 'Connection failed';
            // $http_response_header is populated in this scope by the HTTP wrapper.
            if (isset($http_response_header[0])) {
                $status = $http_response_header[0];
            }
            throw new \Exception($status);
        }

        return $body;
    }

    /**
     * Turn a feed document into entry arrays.
     *
     * Items are read from `channel/item` (RSS) or, failing that, `entry` (Atom).
     * Items with a blank title are skipped.
     *
     * @return array<int, array{title: string, description: string, url: string, published_at: string, categories: string[], source_raw: string}>
     *
     * @throws \Exception When the body cannot be parsed as XML.
     */
    private function parseFeed(string $xml): array
    {
        // Collect libxml errors internally instead of emitting warnings, and
        // restore the caller's setting afterwards.
        $previous = libxml_use_internal_errors(true);
        try {
            $feed = simplexml_load_string($xml, \SimpleXMLElement::class, LIBXML_NONET);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        // Compare with false explicitly: a valid but childless element is falsy.
        if ($feed === false) {
            throw new \Exception('Invalid XML structure');
        }

        $entries = [];
        $items = $feed->channel->item ?? $feed->entry ?? [];
        foreach ($items as $item) {
            $title = (string) ($item->title ?? '');
            if (trim($title) === '') {
                continue;
            }

            $description = (string) ($item->description ?? $item->summary ?? $item->content ?? '');
            $pubDate = (string) ($item->pubDate ?? $item->updated ?? '');

            $entries[] = [
                'title' => $title,
                'description' => strip_tags($description),
                'url' => $this->extractLink($item),
                'published_at' => $this->normalizeDate($pubDate),
                'categories' => $this->extractCategories($item),
                // The complete feed document is attached to every entry.
                'source_raw' => $xml,
            ];
        }

        return $entries;
    }

    /**
     * Resolve an item's URL: RSS link text, else an Atom link `href`
     * (preferring rel="alternate"), else the `guid` text.
     */
    private function extractLink(\SimpleXMLElement $item): string
    {
        $fallback = '';
        foreach ($item->link as $link) {
            $text = trim((string) $link);
            if ($text !== '') {
                return $text;
            }

            // Atom carries the URL in the href attribute of an empty element.
            $href = trim((string) ($link['href'] ?? ''));
            if ($href === '') {
                continue;
            }
            if ((string) ($link['rel'] ?? 'alternate') === 'alternate') {
                return $href;
            }
            if ($fallback === '') {
                $fallback = $href;
            }
        }

        if ($fallback !== '') {
            return $fallback;
        }

        return trim((string) ($item->guid ?? ''));
    }

    /**
     * Collect category labels from element text (RSS) or the `term` attribute (Atom).
     *
     * @return string[]
     */
    private function extractCategories(\SimpleXMLElement $item): array
    {
        $categories = [];
        foreach ($item->category as $category) {
            $label = trim((string) $category);
            if ($label === '') {
                $label = trim((string) ($category['term'] ?? ''));
            }
            if ($label !== '') {
                $categories[] = $label;
            }
        }

        return $categories;
    }

    /**
     * Format a feed date as 'Y-m-d H:i:s', using the current time when the
     * value is missing or cannot be parsed.
     */
    private function normalizeDate(string $raw): string
    {
        $timestamp = $raw !== '' ? strtotime($raw) : false;

        // strtotime() returns false on failure, which date() would otherwise
        // coerce to the Unix epoch.
        return date('Y-m-d H:i:s', $timestamp === false ? time() : $timestamp);
    }
}