130 lines
4.1 KiB
PHP
130 lines
4.1 KiB
PHP
<?php
|
|
|
|
namespace App\Services\Crawler;
|
|
|
|
use App\Services\Database\Connection;
|
|
use PDO;
|
|
use Throwable;
|
|
|
|
/**
 * Downloads RSS/Atom feeds, normalises their items into plain arrays and
 * offers duplicate checks against the `opportunities` and `organizations`
 * tables.
 */
class RssParser
{
    /** User-Agent header sent with every feed request. */
    private const USER_AGENT = 'ScoutIQ/1.0 (Crawler)';

    /** Network timeout, in seconds, applied to both transports. */
    private const TIMEOUT_SECONDS = 15;

    /** Storage format of the `published_at` field. */
    private const DATE_FORMAT = 'Y-m-d H:i:s';

    private PDO $pdo;

    public function __construct(Connection $connection)
    {
        $this->pdo = $connection->getPdo();
    }

    /**
     * Parse an RSS feed URL and return entries.
     *
     * Uses cURL when the extension is loaded and falls back to the HTTP stream
     * wrapper otherwise. Items without a title are skipped.
     *
     * @param string $url Feed location.
     *
     * @return array<int, array{
     *     title: string,
     *     description: string,
     *     url: string,
     *     published_at: string,
     *     categories: array<int, string>,
     *     source_raw: string
     * }>
     *
     * @throws \RuntimeException When the download fails, the server answers
     *                           with a non-200 status, or the body is not
     *                           well-formed XML.
     */
    public function fetchEntries(string $url): array
    {
        $xml = function_exists('curl_init')
            ? $this->downloadWithCurl($url)
            : $this->downloadWithStream($url);

        return $this->parseFeed($xml);
    }

    /**
     * Check if entry URL already exists in opportunities.
     *
     * Only rows whose `deleted_at` is NULL are considered.
     */
    public function entryExists(string $url): bool
    {
        $stmt = $this->pdo->prepare(
            'SELECT 1 FROM opportunities WHERE url = ? AND deleted_at IS NULL LIMIT 1'
        );
        $stmt->execute([$url]);

        return $stmt->fetchColumn() !== false;
    }

    /**
     * Check if organization already exists by domain or name.
     *
     * An exact domain match wins; otherwise the name is matched as a literal
     * substring. Only rows whose `deleted_at` is NULL are considered.
     *
     * @param string      $name   Organization name to look for.
     * @param string|null $domain Optional domain for an exact match.
     *
     * @return int|null Id of the matching organization, or null when none matches.
     */
    public function organizationExists(string $name, ?string $domain = null): ?int
    {
        if ($domain !== null && $domain !== '') {
            $stmt = $this->pdo->prepare(
                'SELECT id FROM organizations WHERE domain = ? AND deleted_at IS NULL LIMIT 1'
            );
            $stmt->execute([$domain]);
            $id = $stmt->fetchColumn();
            if ($id !== false && $id !== null) {
                return (int) $id;
            }
        }

        // An empty needle would become LIKE '%%' and match every organization.
        $needle = trim($name);
        if ($needle === '') {
            return null;
        }

        // Escape LIKE wildcards so names such as "100% Natural" match literally.
        // '!' is used as the escape character because a backslash literal is
        // interpreted differently across SQL dialects.
        $pattern = '%' . strtr($needle, ['!' => '!!', '%' => '!%', '_' => '!_']) . '%';

        $stmt = $this->pdo->prepare(
            "SELECT id FROM organizations WHERE name LIKE ? ESCAPE '!' AND deleted_at IS NULL ORDER BY id LIMIT 1"
        );
        $stmt->execute([$pattern]);
        $id = $stmt->fetchColumn();

        return ($id !== false && $id !== null) ? (int) $id : null;
    }

    /**
     * Download the feed body through the cURL extension.
     *
     * @throws \RuntimeException On transport failure or a non-200 status.
     */
    private function downloadWithCurl(string $url): string
    {
        $ch = curl_init();
        if ($ch === false) {
            throw new \RuntimeException('Connection failed via cURL');
        }

        try {
            curl_setopt_array($ch, [
                CURLOPT_URL => $url,
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_FOLLOWLOCATION => true,
                CURLOPT_USERAGENT => self::USER_AGENT,
                // NOTE(review): certificate verification is disabled, which allows
                // man-in-the-middle tampering with feed content. Kept as-is to
                // preserve behaviour; confirm whether this is still required.
                CURLOPT_SSL_VERIFYPEER => false,
                CURLOPT_TIMEOUT => self::TIMEOUT_SECONDS,
            ]);

            $body = curl_exec($ch);
            // Must be read before the handle is closed.
            $error = curl_error($ch);
            $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        } finally {
            curl_close($ch);
        }

        // Transport failures are checked first: they report status 0 and would
        // otherwise be misreported as "HTTP Error 0".
        if (!is_string($body)) {
            throw new \RuntimeException(
                $error !== '' ? "Connection failed via cURL: {$error}" : 'Connection failed via cURL'
            );
        }

        if ($httpCode !== 200) {
            throw new \RuntimeException("HTTP Error {$httpCode}");
        }

        return $body;
    }

    /**
     * Download the feed body through the HTTP stream wrapper (used when the
     * cURL extension is unavailable).
     *
     * @throws \RuntimeException With the last HTTP status line when one was
     *                           received, otherwise "Connection failed".
     */
    private function downloadWithStream(string $url): string
    {
        $context = stream_context_create([
            'http' => [
                'timeout' => self::TIMEOUT_SECONDS,
                'user_agent' => self::USER_AGENT,
            ],
            // NOTE(review): peer verification is disabled here as well; see the
            // cURL transport. Confirm whether this is still required.
            'ssl' => [
                'verify_peer' => false,
                'verify_peer_name' => false,
            ],
        ]);

        // The warning is suppressed on purpose: failure is reported through the
        // exception below instead.
        $body = @file_get_contents($url, false, $context);
        if ($body !== false) {
            return $body;
        }

        $status = 'Connection failed';
        // $http_response_header is populated by the HTTP wrapper in this scope.
        if (isset($http_response_header) && is_array($http_response_header)) {
            foreach ($http_response_header as $headerLine) {
                // After redirects several status lines are present; report the last.
                if (strncmp($headerLine, 'HTTP/', 5) === 0) {
                    $status = $headerLine;
                }
            }
        }

        throw new \RuntimeException($status);
    }

    /**
     * Turn a raw feed document into normalised entry arrays.
     *
     * @return array<int, array<string, mixed>>
     *
     * @throws \RuntimeException When the document is not well-formed XML.
     */
    private function parseFeed(string $xml): array
    {
        // Collect libxml errors internally instead of silencing them with "@".
        $previous = libxml_use_internal_errors(true);
        try {
            // LIBXML_NONET: never touch the network while parsing.
            $feed = simplexml_load_string($xml, \SimpleXMLElement::class, LIBXML_NONET);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        if ($feed === false) {
            throw new \RuntimeException('Invalid XML structure');
        }

        // RSS 2.0 (channel/item), RSS 1.0 (item under the root), Atom (entry).
        $items = $feed->channel->item ?? $feed->item ?? $feed->entry ?? [];

        $entries = [];
        foreach ($items as $item) {
            $title = (string) ($item->title ?? '');
            // Strict comparison so that a title of "0" is not dropped.
            if ($title === '') {
                continue;
            }

            $description = (string) ($item->description ?? $item->summary ?? $item->content ?? '');
            $rawDate = (string) ($item->pubDate ?? $item->published ?? $item->updated ?? '');

            $entries[] = [
                'title' => $title,
                'description' => strip_tags($description),
                'url' => $this->extractLink($item),
                'published_at' => $this->normalizeDate($rawDate),
                'categories' => $this->extractCategories($item),
                'source_raw' => $xml,
            ];
        }

        return $entries;
    }

    /**
     * Resolve an item's URL: RSS `<link>` text, then Atom `<link href>`
     * (preferring rel="alternate"), then `<guid>`.
     */
    private function extractLink(\SimpleXMLElement $item): string
    {
        $fallbackHref = '';

        foreach ($item->link as $link) {
            $text = trim((string) $link);
            if ($text !== '') {
                return $text;
            }

            $href = trim((string) ($link['href'] ?? ''));
            if ($href === '') {
                continue;
            }

            // A missing rel attribute means "alternate" in Atom.
            $rel = (string) ($link['rel'] ?? 'alternate');
            if ($rel === 'alternate' || $rel === '') {
                return $href;
            }

            if ($fallbackHref === '') {
                $fallbackHref = $href;
            }
        }

        if ($fallbackHref !== '') {
            return $fallbackHref;
        }

        return trim((string) ($item->guid ?? ''));
    }

    /**
     * Collect category labels from RSS element text or the Atom `term`
     * attribute, skipping blank ones.
     *
     * @return array<int, string>
     */
    private function extractCategories(\SimpleXMLElement $item): array
    {
        $categories = [];

        foreach ($item->category as $category) {
            $label = trim((string) $category);
            if ($label === '') {
                $label = trim((string) ($category['term'] ?? ''));
            }
            if ($label !== '') {
                $categories[] = $label;
            }
        }

        return $categories;
    }

    /**
     * Format a feed date as "Y-m-d H:i:s", falling back to the current time
     * when the value is missing or cannot be parsed (instead of 1970-01-01).
     */
    private function normalizeDate(string $raw): string
    {
        $timestamp = $raw !== '' ? strtotime($raw) : false;

        return date(self::DATE_FORMAT, $timestamp !== false ? $timestamp : time());
    }
}