<?php
// Load the saved .txt file
$file = 'all_href_links.txt';

// Check if the file exists
if (file_exists($file)) {
    // Read the saved link list. file_get_contents() returns false when the
    // file cannot be read, and json_decode() returns null for malformed JSON,
    // so both outcomes are checked before the data is iterated.
    $content = file_get_contents($file);
    $allHrefLinks = ($content === false) ? null : json_decode($content, true);

    // Anything other than an array cannot be looped over further down; fall
    // back to an empty list and say why instead of emitting foreach warnings.
    if (!is_array($allHrefLinks)) {
        echo "Could not read a JSON array of links from $file.\n";
        $allHrefLinks = [];
    }

    echo "<pre>Processing Links...\n";

    /**
     * Download the body of a URL with cURL, sending a desktop Chrome user agent.
     *
     * @param string $url Address to request.
     * @return string|false Response body, or false when the cURL handle could
     *                      not be created or the transfer failed.
     */
    function fetchHTML($url) {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        curl_setopt_array($ch, [
            CURLOPT_URL            => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            // Bound both phases of the request so a single unresponsive host
            // cannot stall the whole run indefinitely.
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_TIMEOUT        => 30,
        ]);

        $html = curl_exec($ch);
        if ($html === false) {
            // Record why the transfer failed; the caller still receives false.
            error_log(sprintf('fetchHTML: request to %s failed: %s', $url, curl_error($ch)));
        }

        curl_close($ch);
        return $html;
    }

    /**
     * Pull product link and image attributes out of a page's HTML.
     *
     * Pages that contain a <div> whose class includes 'sorting' are skipped
     * and yield an empty array. Otherwise anchors whose class contains
     * 'product-image-link' contribute 'href', and images whose class contains
     * 'product-image' contribute 'src' and 'title' (taken from 'alt').
     *
     * NOTE(review): links and images are matched purely by their position in
     * two independent queries, so a product missing either element shifts the
     * pairing for every following row - confirm the markup guarantees one
     * image per link.
     *
     * @param string|false $html Markup to parse; false or blank input (for
     *                           example a failed download) yields [].
     * @return array List of rows keyed by position, each holding any of
     *               'href', 'src' and 'title'.
     */
    function extractPageData($html) {
        // DOMDocument::loadHTML() rejects empty input, so bail out early when
        // there is nothing to parse.
        if (!is_string($html) || trim($html) === '') {
            return [];
        }

        $dom = new DOMDocument();

        // Collect parser warnings internally for the duration of this call,
        // then restore whatever libxml error mode was active before.
        $previousErrorMode = libxml_use_internal_errors(true);
        $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        $xpath = new DOMXPath($dom);

        // Skip this page entirely if the 'sorting' class exists.
        if ($xpath->query("//div[contains(@class, 'sorting')]")->length > 0) {
            return [];
        }

        $data = [];

        $productLinks = $xpath->query("//a[contains(@class, 'product-image-link')]");
        $productImages = $xpath->query("//img[contains(@class, 'product-image')]");

        foreach ($productLinks as $key => $link) {
            $data[$key]['href'] = $link->getAttribute('href');
        }
        foreach ($productImages as $key => $image) {
            $data[$key]['src'] = $image->getAttribute('src');
            $data[$key]['title'] = $image->getAttribute('alt'); // 'alt' doubles as the title
        }

        return $data;
    }

    // Replace every URL in $allHrefLinks with an array holding the URL under
    // 'url' and, when the page yielded product data, that data under 'subarray'.
    foreach ($allHrefLinks as &$link) {
        echo "Processing: $link\n";

        // Fetch HTML content for the page
        $html = fetchHTML($link);

        // A failed download (false) or an empty body has nothing to parse, so
        // it is treated the same as a page without product data.
        $pageData = (is_string($html) && $html !== '') ? extractPageData($html) : [];

        $entry = ['url' => $link];
        if (!empty($pageData)) {
            $entry['subarray'] = $pageData;
        }
        $link = $entry;
    }
    // Break the reference left behind by the by-reference foreach so a later
    // write to $link cannot silently overwrite the last array element.
    unset($link);

    // Dump the enriched structure so the run can be inspected.
    echo "<pre>";print_r($allHrefLinks);

    // Collect every product 'href' found under a 'subarray' as an absolute URL.
    $baseUrl = 'https://www.versagroup.be';
    $allSubarrayLinks = [];

    foreach ($allHrefLinks as $parent) {
        if (!isset($parent['subarray'])) {
            continue;
        }

        foreach ($parent['subarray'] as $child) {
            // Rows without an href (or with an empty one) have no target.
            if (!isset($child['href']) || $child['href'] === '') {
                continue;
            }

            $href = $child['href'];

            if (preg_match('#^[a-z][a-z0-9+.\-]*://#i', $href) === 1) {
                // Already absolute (any scheme and host): keep untouched.
                $allSubarrayLinks[] = $href;
            } elseif (strpos($href, '//') === 0) {
                // Protocol-relative URL: only the scheme is missing.
                $allSubarrayLinks[] = 'https:' . $href;
            } else {
                // Site-relative path: join with exactly one slash so both
                // '/path' and 'path' produce a well-formed URL.
                $allSubarrayLinks[] = $baseUrl . '/' . ltrim($href, '/');
            }
        }
    }

    // Output the extracted subarray href links
    echo "\nAll Subarray Href Links:\n";
    echo "<pre>";print_r($allSubarrayLinks);

    // Save the collected links as JSON and report success only when both the
    // encoding and the write actually succeeded.
    $outputFile = 'updated_all_href_links.txt';
    $encodedLinks = json_encode($allSubarrayLinks);
    if ($encodedLinks === false || file_put_contents($outputFile, $encodedLinks) === false) {
        echo "Failed to save subarray links to $outputFile.\n";
    } else {
        echo "All subarray links saved to $outputFile.\n";
    }
    
    
} else {
    echo "File $file not found.";
}
?>
