<?php
// generate_sitemap.php
// Usage: php generate_sitemap.php
// Requirements: PHP 7.0+ with the curl, dom (libxml) and xmlwriter extensions enabled

// ------------- CONFIG ----------------
$startUrl = 'https://tripgoeson.com/';  // crawl entry point; only links on this host are followed
$maxPages = 25000;                      // safety cap used by the crawl loop
$sitemapUrlLimit = 45000;               // URLs per sitemap file before a new one is started (spec allows 50k)
$outputDir = __DIR__ . '/sitemaps';     // where the sitemap-N.xml files are written
$userAgent = 'PHP Sitemap Generator';   // sent as the User-Agent header on every request
$fetchTimeout = 15;                     // total curl timeout per request, in seconds
// -------------------------------------

// Lift the usual web-SAPI limits in case the script is not run from the CLI.
@set_time_limit(0);
@ini_set('memory_limit', '512M');

// Make sure the output directory exists. The trailing is_dir() re-check covers
// the case where another process created it between our check and mkdir().
if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
    exit("Could not create output dir: $outputDir\n");
}

// normalize start URL
$startUrl = normalizeUrl($startUrl);

// Crawl frontier (FIFO, so the crawl is breadth-first) and the set of every
// URL that has ever been enqueued. $visited therefore counts *discovered*
// URLs, not fetched ones.
$queue = new SplQueue();
$queue->enqueue($startUrl);
$visited = [$startUrl => true]; // associative set for O(1) lookups

$sitemapFiles = []; // list of sitemap filenames created
$currentSitemapIndex = 1;
$currentSitemapCount = 0;
$currentWriter = createSitemapWriter($outputDir, $currentSitemapIndex);
echo "Starting crawl: $startUrl\n";

// Extensions that are never HTML pages; hoisted out of the loop because the list is constant.
$nonHtmlExts = ['jpg','jpeg','png','gif','bmp','pdf','zip','rar','exe','svg','ico'];

// Drain the whole queue. $maxPages is enforced where URLs are *enqueued*, so
// at most $maxPages URLs are ever fetched, and every URL that was discovered
// before the cap was reached still gets crawled and written to a sitemap.
while (!$queue->isEmpty()) {
    $currentUrl = $queue->dequeue();

    // skip non-html resources quickly (images, pdf, etc.)
    $ext = strtolower(pathinfo(parse_url($currentUrl, PHP_URL_PATH) ?? '', PATHINFO_EXTENSION));
    if (in_array($ext, $nonHtmlExts, true)) {
        continue;
    }

    $html = fetchHTML($currentUrl, $userAgent, $fetchTimeout);
    if ($html === false || $html === '') {
        // skip on failure
        continue;
    }

    // Rotate lazily, right before a write that would exceed the limit. This
    // guarantees a freshly opened sitemap file always receives at least one
    // URL, so the index never points at an empty <urlset>.
    if ($currentSitemapCount >= $sitemapUrlLimit) {
        finalizeSitemapWriter($currentWriter);
        $sitemapFiles[] = basename($currentWriter['filename']);
        $currentSitemapIndex++;
        $currentWriter = createSitemapWriter($outputDir, $currentSitemapIndex);
        $currentSitemapCount = 0;
    }

    // Write this URL to current sitemap
    writeUrlToSitemap($currentWriter, $currentUrl);
    $currentSitemapCount++;

    // Once the discovery cap is reached there is nothing left to enqueue,
    // so link extraction is skipped for the remaining pages.
    if (count($visited) < $maxPages) {
        // Relative hrefs must be resolved against the page they were found on,
        // not against the crawl entry point.
        $links = getLinksFromHTML($html, $currentUrl);
        foreach ($links as $link) {
            // normalize and ensure same host
            $linkNorm = normalizeUrl($link);
            if (!isSameSite($startUrl, $linkNorm)) continue;
            if (isset($visited[$linkNorm])) continue;
            // URLs carrying a query string are deliberately left out of the crawl
            if (strpos($linkNorm, '?') !== false) continue;

            $visited[$linkNorm] = true;
            $queue->enqueue($linkNorm);
            // stop discovering (but keep crawling what is queued) once maxPages is reached
            if (count($visited) >= $maxPages) break;
        }
    }
    // small sleep to be polite; comment out if you want max speed (but be careful)
    usleep(100000); // 100ms
}

// finalize last sitemap writer
finalizeSitemapWriter($currentWriter);
$sitemapFiles[] = basename($currentWriter['filename']);

// Build sitemap index (saved next to this script, i.e. the site ROOT);
// the per-file sitemaps are expected to be served from <start URL>/sitemaps.
$indexFilename = __DIR__ . '/sitemap.xml';

buildSitemapIndex(
    $indexFilename,
    $sitemapFiles,
    rtrim($startUrl, '/') . '/sitemaps'
);


echo "Crawl complete. Generated " . count($sitemapFiles) . " sitemap file(s).\n";
echo "Sitemap index: $indexFilename\n";

/* =======================
   Helper functions below
   ======================= */

/**
 * Canonicalize a URL so that equivalent spellings compare equal.
 *
 * The scheme and host are lower-cased, runs of slashes in the path are
 * collapsed, a trailing slash is removed (except for the root path), the
 * fragment is dropped and the query string is kept. Input without a scheme
 * is treated as "http://<input>".
 *
 * @param string $url Raw URL; surrounding whitespace is ignored.
 * @return string The canonical URL, or '' when the input is empty, cannot be
 *                parsed, or has no host.
 */
function normalizeUrl($url) {
    $url = trim($url);
    if ($url === '') return $url;

    $parts = parse_url($url);
    if (is_array($parts) && !isset($parts['scheme'])) {
        // No scheme given (e.g. "example.com/page" or "//example.com/page"): assume http.
        $parts = parse_url('http://' . ltrim($url, '/'));
    }
    // parse_url() returns false on seriously malformed input, and a URL without
    // a host cannot be crawled; report both as "no URL" instead of emitting
    // garbage such as "://".
    if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
        return '';
    }

    $scheme = strtolower($parts['scheme']);
    $host = strtolower($parts['host']);
    $port = isset($parts['port']) ? ':' . $parts['port'] : '';

    // remove duplicate slashes in path
    $path = preg_replace('#//+#', '/', $parts['path'] ?? '/');
    // canonicalize: remove trailing slash except for root
    if ($path !== '/') {
        $path = rtrim($path, '/');
    }

    // keep the query string; the fragment is dropped simply by not re-appending it
    $query = isset($parts['query']) ? '?' . $parts['query'] : '';

    return $scheme . '://' . $host . $port . $path . $query;
}

/**
 * Check whether two URLs live on the same host.
 *
 * Hosts are compared case-insensitively and must match exactly (so
 * "www.example.com" and "example.com" are different sites). Scheme, port and
 * path are ignored.
 *
 * @param string $base Reference URL.
 * @param string $url  URL to test.
 * @return bool False when either URL has no host component.
 */
function isSameSite($base, $url) {
    $hosts = [];
    foreach ([$base, $url] as $candidate) {
        $parsed = parse_url($candidate);
        if (!isset($parsed['host'])) {
            return false;
        }
        $hosts[] = strtolower($parsed['host']);
    }
    return $hosts[0] === $hosts[1];
}

/**
 * Download a page body with cURL.
 *
 * Redirects are followed (bounded, to avoid redirect loops) and TLS
 * certificates are verified. Only a 2xx final response counts as success, so
 * error pages (404, 500, ...) that come with a body are not mistaken for
 * content.
 *
 * @param string $url       Absolute URL to fetch.
 * @param string $userAgent Value for the User-Agent header.
 * @param int    $timeout   Total time allowed for the request, in seconds.
 * @return string|false Response body, or false on transport failure or a
 *                      non-2xx final status.
 */
function fetchHTML($url, $userAgent, $timeout = 15) {
    $ch = curl_init($url);
    if ($ch === false) {
        return false;
    }
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS      => 10,   // stop on redirect loops instead of spinning until the timeout
        CURLOPT_USERAGENT      => $userAgent,
        CURLOPT_CONNECTTIMEOUT => 6,
        CURLOPT_TIMEOUT        => $timeout,
        // Certificate and host-name verification are kept ON; disabling them
        // would hide TLS problems and is not recommended for production.
        CURLOPT_SSL_VERIFYPEER => true,
        CURLOPT_SSL_VERIFYHOST => 2,
    ]);

    $body = curl_exec($ch);
    // With FOLLOWLOCATION this is the status of the final response in the redirect chain.
    $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    if ($body === false || $status < 200 || $status >= 300) {
        return false;
    }
    return $body;
}

/**
 * Extract the distinct http(s) link targets of all <a href> elements.
 *
 * Each href is made absolute with resolveURL() against $baseUrl; empty and
 * fragment-only hrefs as well as non-http(s) schemes (mailto:, tel:,
 * javascript:, ...) are dropped.
 *
 * @param string $html    Markup of the page.
 * @param string $baseUrl URL used to resolve relative hrefs.
 * @return string[] Absolute URLs, de-duplicated, in order of first appearance.
 */
function getLinksFromHTML($html, $baseUrl) {
    // DOMDocument::loadHTML() rejects an empty string (ValueError on PHP 8),
    // and a blank document cannot contain links anyway.
    if (!is_string($html) || trim($html) === '') {
        return [];
    }

    // Real-world HTML is rarely valid: collect parser errors internally instead
    // of emitting warnings, then restore whatever mode the caller had set.
    $previousMode = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $loaded = $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousMode);
    if ($loaded === false) {
        return [];
    }

    $xpath = new DOMXPath($dom);
    $nodes = $xpath->query("//a[@href]");
    if ($nodes === false) {
        return [];
    }

    $set = [];
    foreach ($nodes as $node) {
        $href = trim($node->getAttribute('href'));
        $resolved = resolveURL($href, $baseUrl);
        if ($resolved === null) continue;
        // ignore mailto:, tel:, javascript:
        $parsed = parse_url($resolved);
        $scheme = isset($parsed['scheme']) ? strtolower($parsed['scheme']) : 'http';
        if (!in_array($scheme, ['http', 'https'], true)) continue;
        // dedupe: array keys form the set
        $set[$resolved] = true;
    }
    return array_keys($set);
}

/**
 * Turn an href into an absolute URL, following the reference-resolution rules
 * of RFC 3986 section 5.2 (scheme, network-path, absolute-path, relative-path
 * and query-only references, including "." and ".." segments).
 *
 * @param string $relativeUrl The href as written in the document.
 * @param string $baseUrl     Absolute URL of the document containing the href.
 * @return string|null Absolute URL, or null for an empty or fragment-only
 *                     href (those never point at another document). URLs that
 *                     already carry a scheme are returned unchanged, whatever
 *                     the scheme is; filtering is left to the caller.
 */
function resolveURL($relativeUrl, $baseUrl) {
    if ($relativeUrl === '' || $relativeUrl[0] === '#') return null;

    // Anything starting with "scheme:" is already absolute. A syntactic check
    // is used instead of parse_url(), which misreads e.g. "tel:123" as host:port.
    if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $relativeUrl) === 1) {
        return $relativeUrl;
    }

    $pb = parse_url($baseUrl);
    if (!is_array($pb)) {
        $pb = [];
    }
    $scheme = $pb['scheme'] ?? 'http';

    // protocol-relative ("//host/path"): only the scheme comes from the base
    if (strpos($relativeUrl, '//') === 0) {
        return $scheme . ':' . $relativeUrl;
    }

    $port = isset($pb['port']) ? ':' . $pb['port'] : '';
    $authority = $scheme . '://' . ($pb['host'] ?? '') . $port;
    $basePath = $pb['path'] ?? '/';

    // Separate the path from a trailing ?query / #fragment so that path
    // manipulation below can never touch them.
    $cut = strcspn($relativeUrl, '?#');
    $relPath = (string) substr($relativeUrl, 0, $cut);
    $suffix = (string) substr($relativeUrl, $cut);

    // query-only reference ("?page=2"): same document path, different query
    if ($relPath === '') {
        return $authority . $basePath . $suffix;
    }

    // Collapse "." and ".." segments of an absolute path; never climbs above the root.
    $removeDotSegments = static function ($path) {
        $segments = explode('/', $path);
        $last = count($segments) - 1;
        $out = [];
        foreach ($segments as $i => $segment) {
            if ($segment === '.' || $segment === '..') {
                if ($segment === '..' && count($out) > 1) {
                    array_pop($out);
                }
                if ($i === $last) {
                    $out[] = ''; // "a/b/.." names the directory "a/": keep the trailing slash
                }
                continue;
            }
            $out[] = $segment;
        }
        return implode('/', $out);
    };

    if ($relPath[0] === '/') {
        // root-relative
        $path = $relPath;
    } else {
        // relative path: replace everything after the last "/" of the base path
        $slash = strrpos($basePath, '/');
        $baseDir = $slash === false ? '/' : substr($basePath, 0, $slash + 1);
        // collapse duplicate slashes in the PATH only (never in "scheme://")
        $path = preg_replace('#/{2,}#', '/', $baseDir . $relPath);
    }

    return $authority . $removeDotSegments($path) . $suffix;
}

/* ---------- Sitemap writer helpers ---------- */
/**
 * Open "<dir>/sitemap-<index>.xml" and write the document prologue plus the
 * opening <urlset> tag. The caller appends <url> entries and must close the
 * document with finalizeSitemapWriter().
 *
 * @param string $dir   Existing directory that receives the file.
 * @param int    $index Sequence number used in the file name.
 * @return array{writer: XMLWriter, filename: string}
 * @throws RuntimeException When the directory is missing or the file cannot
 *                          be opened for writing.
 */
function createSitemapWriter($dir, $index) {
    if (!is_dir($dir)) {
        throw new RuntimeException("Sitemap output directory does not exist: $dir");
    }
    $filename = $dir . '/sitemap-' . $index . '.xml';
    $writer = new XMLWriter();
    // Fail loudly here: continuing with an unopened writer would only surface
    // later as a series of unrelated-looking XMLWriter errors.
    if ($writer->openURI($filename) === false) {
        throw new RuntimeException("Could not open sitemap file for writing: $filename");
    }
    $writer->startDocument('1.0', 'UTF-8');
    $writer->setIndent(true);
    $writer->startElement('urlset');
    $writer->writeAttribute('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9');
    return ['writer' => $writer, 'filename' => $filename];
}

/**
 * Append one <url> entry to an open sitemap.
 *
 * Every entry gets the same fixed metadata: changefreq "weekly" and
 * priority "0.8".
 *
 * @param array  $writerWrapper Wrapper array holding the XMLWriter under the 'writer' key.
 * @param string $loc           Page URL to store in <loc>.
 * @return void
 */
function writeUrlToSitemap($writerWrapper, $loc) {
    $xml = $writerWrapper['writer'];

    // Child elements in the order they are emitted.
    $fields = [
        'loc'        => $loc,
        'changefreq' => 'weekly',
        'priority'   => '0.8',
    ];

    $xml->startElement('url');
    foreach ($fields as $tag => $text) {
        $xml->writeElement($tag, $text);
    }
    $xml->endElement(); // </url>
}

/**
 * Close the currently open element and the document of a sitemap writer, then
 * flush the buffered XML to its target.
 *
 * @param array $writerWrapper Wrapper array holding the XMLWriter under the
 *                             'writer' key; a falsy writer makes this a no-op.
 * @return void
 */
function finalizeSitemapWriter($writerWrapper) {
    $xml = $writerWrapper['writer'];
    if (!$xml) {
        return;
    }
    $xml->endElement();   // closes the element that is still open (the <urlset> root)
    $xml->endDocument();
    $xml->flush();
}

/**
 * Write a sitemap index file that references every generated sitemap.
 *
 * @param string   $indexFilePath  Filesystem path of the index file to create.
 * @param string[] $sitemapFiles   File names (no directory part) of the sitemaps.
 * @param string   $sitemapBaseUrl Public URL of the directory the sitemap files
 *                                 are served from; a trailing slash is optional.
 * @return void
 * @throws RuntimeException When the index file cannot be opened for writing.
 */
function buildSitemapIndex($indexFilePath, $sitemapFiles, $sitemapBaseUrl) {
    $w = new XMLWriter();
    // Fail loudly here rather than let every later writer call error out.
    if ($w->openURI($indexFilePath) === false) {
        throw new RuntimeException("Could not open sitemap index for writing: $indexFilePath");
    }
    $w->startDocument('1.0','UTF-8');
    $w->setIndent(true);
    $w->startElement('sitemapindex');
    $w->writeAttribute('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9');

    // Loop invariants: all files share one base URL and one generation timestamp.
    $base = rtrim($sitemapBaseUrl, '/');
    $lastmod = date('c'); // ISO 8601 timestamp of this run

    foreach ($sitemapFiles as $f) {
        $w->startElement('sitemap');
        $w->writeElement('loc', $base . '/' . $f);
        $w->writeElement('lastmod', $lastmod);
        $w->endElement(); // sitemap
    }

    $w->endElement(); // sitemapindex
    $w->endDocument();
    $w->flush();
}
?>