PHP Classes

File: examples/case-studies/websites/resource_aware_batch_crawler/resource_aware_batch_crawler.php

Recommend this page to a friend!
  Packages of Christos Drogidis   Ascoos OS   examples/case-studies/websites/resource_aware_batch_crawler/resource_aware_batch_crawler.php   Download  
File: examples/case-studies/websites/resource_aware_batch_crawler/resource_aware_batch_crawler.php
Role: Example script
Content type: text/plain
Description: Example script
Class: Ascoos OS
A PHP Web 5.0 Kernel for decentralized web and IoT
Author: By
Last change:
Date: 6 months ago
Size: 3,413 bytes
 

Contents

Class file image Download
<?php
/**
 * @ASCOOS-NAME : Ascoos OS
 * @ASCOOS-VERSION : 26.0.0
 * @ASCOOS-SUPPORT : support@ascoos.com
 * @ASCOOS-BUGS : https://issues.ascoos.com
 *
 * @CASE-STUDY : resource_aware_batch_crawler.php
 * @fileNo : ASCOOS-OS-CASESTUDY-SEC00253
 *
 * @desc <English> Crawls multiple URLs in batch, adapts depth based on system resources (CPU/memory), extracts content, and reports — integrating TCoreSystemHandler, TWebsiteHandler, and TFilesHandler for Web5 scalable scraping.
 * @desc <Greek> Σαρώνει πολλαπλές URLs σε batch, προσαρμόζει βάθος βάσει πόρων συστήματος (CPU/μνήμη), εξάγει περιεχόμενο, και αναφέρει — ενσωματώνοντας TCoreSystemHandler, TWebsiteHandler, και TFilesHandler για scalable scraping Web5.
 *
 * @since PHP 8.2.0+
 */
declare(strict_types=1);

use
ASCOOS\OS\Kernel\Systems\TCoreSystemHandler;
use
ASCOOS\OS\Kernel\Websites\TWebsiteHandler;
use
ASCOOS\OS\Kernel\Files\TFilesHandler;

// Resource-aware batch crawl:
//   1. for each URL, sample CPU and memory load and decide between a light and a full crawl;
//   2. always record availability and load time; in full mode also fetch HTML and keywords;
//   3. write the collected reports as a JSON file under the configured base directory
//      and echo the same JSON.
global $AOS_TMP_DATA_PATH, $utf8;

// Settings passed to the file handler (report storage) and the system handler (load thresholds).
$properties = [
    'file' => [
        'baseDir' => $AOS_TMP_DATA_PATH . '/crawl_reports/',
        'quotaSize' => 100000000, // 100MB quota
    ],
    'system' => [
        'cpu_percent_warn' => 70, // Threshold for light crawl
        'memory_percent_warn' => 80, // Memory limit
    ],
];

// Declared before the try so the finally block can release whichever handler was
// actually constructed, even when a constructor throws.
$system = null;
$files = null;

try {
    $system = new TCoreSystemHandler($properties['system']);
    $files = new TFilesHandler([], $properties['file']);
    $batch = ['https://ascoos.com', 'https://awes.ascoos.com', 'https://example.com'];

    $reports = [];
    foreach ($batch as $url) {
        // Sample the load per URL so the crawl mode follows current conditions.
        $cpuLoad = $system->get_cpu_load(0);
        $memLoad = $system->get_memory_stats()['percent'];
        $lightMode = $cpuLoad > $properties['system']['cpu_percent_warn']
            || $memLoad > $properties['system']['memory_percent_warn'];

        $website = new TWebsiteHandler();
        try {
            $website->setUrl($url);

            // Basic crawl always.
            $availability = $website->checkAvailability($url);
            $loadTime = $website->analyzeLoadTime($url);

            // Full crawl if resources allow. In light mode there is no HTML to excerpt,
            // so both fields stay null instead of feeding a placeholder array to substr().
            $excerpt = null;
            $keywords = null;
            if (!$lightMode) {
                $content = $website->getHTMLContent($url);
                $keywords = $website->extractKeywords($url);
                // NOTE(review): assumes getHTMLContent() yields a string on success and
                // $utf8->substr() expects one — confirm against the handler signatures.
                if (is_string($content)) {
                    $excerpt = $utf8->substr($content, 0, 200);
                }
            }

            $reports[] = [
                'url' => $url,
                'cpu_load' => $cpuLoad,
                'mem_load' => $memLoad,
                'light_mode' => $lightMode,
                'availability' => $availability,
                'load_time' => $loadTime,
                'content_excerpt' => $excerpt, // First 200 characters of the HTML; null in light mode
                'keywords' => $keywords, // Extracted keywords; null in light mode
            ];
        } finally {
            // Release the per-URL handler even when one of the crawl calls throws.
            $website->Free();
        }
    }

    // Encode once and reuse for both the file and the console output.
    // JSON_THROW_ON_ERROR turns an encoding failure into a JsonException (handled below)
    // instead of a silent false; JSON_INVALID_UTF8_SUBSTITUTE keeps the report writable
    // when a crawled page contains bytes that are not valid UTF-8.
    $reportJson = json_encode(
        $reports,
        JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE | JSON_THROW_ON_ERROR
    );

    // Save batch report.
    $files->createFolder($properties['file']['baseDir']);
    $reportFile = $properties['file']['baseDir'] . 'batch_crawl_' . date('Ymd_His') . '.json';
    $files->writeToFileWithCheck($reportJson, $reportFile);

    echo "Batch Crawl Complete. Reports: " . $reportJson . "\n";
} catch (Exception $e) {
    echo "Error: " . $e->getMessage() . "\n";
} finally {
    // Nullsafe calls: a handler is only freed if its constructor completed.
    $system?->Free();
    $files?->Free();
}
?>