michaelschiemer/src/Framework/Queue/MachineLearning/JobAnomalyDetector.php

<?php

declare(strict_types=1);

namespace App\Framework\Queue\MachineLearning;

use App\Framework\Queue\MachineLearning\ValueObjects\JobFeatures;
use App\Framework\Queue\MachineLearning\ValueObjects\JobAnomalyResult;
use App\Framework\Core\ValueObjects\Score;

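/**
 * Job Anomaly Detector - Statistical and Heuristic Job Behavior Analysis
 *
 * Detects anomalous job execution patterns using a combination of:
 * - Statistical outlier detection (Z-Score, IQR methods)
 * - Heuristic pattern matching (high failure risk, performance degradation, etc.)
 * - Multi-feature analysis with weighted scoring
 *
 * Detection Methods:
 * 1. Z-Score Analysis: Identifies statistical outliers (>3 standard deviations)
 * 2. IQR Analysis: Identifies outliers using interquartile range (>1.5 * IQR)
 * 3. Heuristic Patterns: Rule-based pattern classification
 * 4. Weighted Feature Scoring: Combines feature scores with domain weights
 *
 * NOTE: Of the methods listed above, this class itself only performs (3) and (4),
 * plus per-feature low/high threshold scoring. The Z-score threshold and IQR
 * multiplier are accepted as configuration and reported by getConfiguration(),
 * but no Z-score or IQR computation is carried out here.
 *
 * All confidence scores use the framework's Core Score value object. Scores are
 * created from 0.0-1.0 decimals via Score::fromDecimal(); values read back through
 * getValue() are treated by this class as percentages (0-100).
 */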
final readonly class JobAnomalyDetector
{
    /**
     * Relative importance of each feature when averaging feature scores
     * into the overall anomaly score (domain knowledge).
     *
     * @var array<string, float>
     */
    private const FEATURE_WEIGHTS = [
        'failure_rate' => 2.0,                      // Most critical
        'retry_frequency' => 1.8,                   // Very important
        'memory_usage_pattern' => 1.5,              // Important for resource issues
        'execution_time_variance' => 1.3,           // Performance indicator
        'queue_depth_correlation' => 1.2,           // Scalability indicator
        'payload_size_anomaly' => 1.0,              // Moderate importance
        'dependency_chain_complexity' => 0.8,       // Less critical
        'execution_timing_regularity' => 0.7,       // Informational
    ];

    /** Weight applied to any feature that has no entry in FEATURE_WEIGHTS. */
    private const DEFAULT_FEATURE_WEIGHT = 1.0;

    /**
     * @param Score $anomalyThreshold Minimum overall score to classify as anomalous (default: 50%)
     * @param float $zScoreThreshold Z-score threshold (default: 3.0). Only stored and reported by
     *                               getConfiguration(); not used by any computation in this class.
     * @param float $iqrMultiplier IQR multiplier (default: 1.5). Only stored and reported by
     *                             getConfiguration(); not used by any computation in this class.
     */
    public function __construct(
        private Score $anomalyThreshold = new Score(50),  // 50% threshold
        private float $zScoreThreshold = 3.0,
        private float $iqrMultiplier = 1.5
    ) {}

    /**
     * Detect anomalies in job execution features
     *
     * Scores every feature, collects heuristic patterns, combines both into an
     * overall score and compares it against the configured anomaly threshold.
     *
     * @return JobAnomalyResult anomalous() when the overall score reaches the threshold,
     *                          lowConfidence() when it is above zero but below the threshold,
     *                          normal() when it is zero.
     */
    public function detect(JobFeatures $features): JobAnomalyResult
    {
        $featureScores = $this->calculateFeatureScores($features);
        $detectedPatterns = $this->detectPatterns($features);
        $overallScore = $this->calculateOverallScore($featureScores, $detectedPatterns);

        if ($overallScore->getValue() >= $this->anomalyThreshold->getValue()) {
            // The primary indicator is only reported for anomalous results,
            // so it is only computed on this path.
            return JobAnomalyResult::anomalous(
                $overallScore,
                $featureScores,
                $detectedPatterns,
                $this->identifyPrimaryIndicator($featureScores)
            );
        }

        if ($overallScore->getValue() > 0) {
            return JobAnomalyResult::lowConfidence(
                $overallScore,
                $featureScores,
                'Score below anomaly threshold'
            );
        }

        return JobAnomalyResult::normal('No anomalies detected');
    }

    /**
     * Calculate anomaly score for each feature
     *
     * Every entry of JobFeatures::toArray() is mapped through the feature-specific
     * thresholds and wrapped in a Score.
     *
     * @return array<string, Score> Anomaly score keyed by feature name
     */
    private function calculateFeatureScores(JobFeatures $features): array
    {
        $scores = [];

        foreach ($features->toArray() as $featureName => $value) {
            // Feature values are expected on a 0.0-1.0 scale (see Score::fromDecimal)
            $scores[$featureName] = Score::fromDecimal(
                $this->featureValueToAnomalyScore($featureName, $value)
            );
        }

        return $scores;
    }

    /**
     * Convert feature value to anomaly score using domain-specific thresholds
     *
     * Different features have different "normal" ranges and criticality.
     * Features without dedicated thresholds are passed through unchanged.
     */
    private function featureValueToAnomalyScore(string $featureName, float $value): float
    {
        return match ($featureName) {
            // Critical features: Lower threshold for anomaly
            'failure_rate' => $this->scoreWithThreshold($value, 0.1, 0.3),  // >10% concerning, >30% critical
            'retry_frequency' => $this->scoreWithThreshold($value, 0.2, 0.5),  // >20% concerning, >50% critical
            'memory_usage_pattern' => $this->scoreWithThreshold($value, 0.5, 0.7),  // >50% concerning, >70% critical

            // Important features: Medium threshold
            'execution_time_variance' => $this->scoreWithThreshold($value, 0.4, 0.6),
            'queue_depth_correlation' => $this->scoreWithThreshold($value, 0.6, 0.8),
            'payload_size_anomaly' => $this->scoreWithThreshold($value, 0.6, 0.8),

            // Informational features: Higher threshold
            'dependency_chain_complexity' => $this->scoreWithThreshold($value, 0.7, 0.9),
            'execution_timing_regularity' => $this->scoreWithThreshold($value, 0.8, 0.95),  // Very high regularity = bot

            default => $value,  // Fallback: use value directly
        };
    }

    /**
     * Score feature value using low/high thresholds
     *
     * - value <= low: 0.0 (normal)
     * - low < value < high: linear scale 0.0-0.5 (medium concern)
     * - value >= high: max(0.5, value) (high anomaly)
     *
     * The floor of 0.5 in the top band keeps the score monotonic: without it, a
     * feature whose high threshold is below 0.5 (e.g. failure_rate at 0.3) would
     * score lower at the critical threshold than just beneath it.
     */
    private function scoreWithThreshold(float $value, float $lowThreshold, float $highThreshold): float
    {
        if ($value <= $lowThreshold) {
            return 0.0;
        }

        if ($value >= $highThreshold) {
            // Never score a critical value below the top of the medium band
            return max(0.5, $value);
        }

        // Linear interpolation between low and high threshold.
        // Reaching this point implies low < value < high, so the range is > 0.
        $range = $highThreshold - $lowThreshold;
        $position = ($value - $lowThreshold) / $range;

        return $position * 0.5;  // Scale to 0.0-0.5 for medium concern
    }

    /**
     * Detect heuristic anomaly patterns
     *
     * Uses JobFeatures built-in indicators for pattern matching. Each matched
     * indicator yields one pattern entry whose confidence is derived from the
     * two features that describe it.
     *
     * @return array<array{type: string, confidence: Score, description: string}>
     */
    private function detectPatterns(JobFeatures $features): array
    {
        $patterns = [];

        // Pattern 1: High Failure Risk
        if ($features->indicatesHighFailureRisk()) {
            $patterns[] = $this->buildPattern(
                'high_failure_risk',
                [$features->failureRate, $features->retryFrequency],
                'High failure rate (%.1f%%) with excessive retries (%.1f%%)',
                $features->failureRate,
                $features->retryFrequency
            );
        }

        // Pattern 2: Performance Degradation
        if ($features->indicatesPerformanceDegradation()) {
            $patterns[] = $this->buildPattern(
                'performance_degradation',
                [$features->executionTimeVariance, $features->memoryUsagePattern],
                'Unstable execution times (variance: %.1f%%) and memory patterns (%.1f%%)',
                $features->executionTimeVariance,
                $features->memoryUsagePattern
            );
        }

        // Pattern 3: Resource Exhaustion
        if ($features->indicatesResourceExhaustion()) {
            $patterns[] = $this->buildPattern(
                'resource_exhaustion',
                [$features->queueDepthCorrelation, $features->memoryUsagePattern],
                'High queue depth impact (%.1f%%) with memory anomalies (%.1f%%)',
                $features->queueDepthCorrelation,
                $features->memoryUsagePattern
            );
        }

        // Pattern 4: Automated Execution (Bot-like)
        if ($features->indicatesAutomatedExecution()) {
            $patterns[] = $this->buildPattern(
                'automated_execution',
                [
                    $features->executionTimingRegularity,
                    1.0 - $features->executionTimeVariance,  // Inverted: low variance = higher confidence
                ],
                'Very regular timing (%.1f%%) with low variance (%.1f%%) - possible bot activity',
                $features->executionTimingRegularity,
                $features->executionTimeVariance
            );
        }

        // Pattern 5: Data Processing Anomaly
        if ($features->indicatesDataProcessingAnomaly()) {
            $patterns[] = $this->buildPattern(
                'data_processing_anomaly',
                [$features->payloadSizeAnomaly, $features->memoryUsagePattern],
                'Unusual payload sizes (%.1f%%) with memory pattern anomalies (%.1f%%)',
                $features->payloadSizeAnomaly,
                $features->memoryUsagePattern
            );
        }

        return $patterns;
    }

    /**
     * Assemble a single detected-pattern entry
     *
     * @param string $type Machine-readable pattern identifier
     * @param array<float> $confidenceInputs Feature values feeding the confidence calculation
     * @param string $descriptionFormat sprintf format with two float placeholders
     * @param float $firstValue First described feature value (0.0-1.0, rendered as percent)
     * @param float $secondValue Second described feature value (0.0-1.0, rendered as percent)
     *
     * @return array{type: string, confidence: Score, description: string}
     */
    private function buildPattern(
        string $type,
        array $confidenceInputs,
        string $descriptionFormat,
        float $firstValue,
        float $secondValue
    ): array {
        return [
            'type' => $type,
            'confidence' => Score::fromDecimal($this->calculatePatternConfidence($confidenceInputs)),
            'description' => sprintf($descriptionFormat, $firstValue * 100, $secondValue * 100),
        ];
    }

    /**
     * Calculate pattern confidence from contributing feature values
     *
     * Takes the plain average of the values and adds 0.1 per value above 0.7
     * (at most 0.2 in total); the result is capped at 1.0.
     *
     * @param array<float> $featureValues
     *
     * @return float Confidence, at most 1.0 (0.0 for an empty input)
     */
    private function calculatePatternConfidence(array $featureValues): float
    {
        if (empty($featureValues)) {
            return 0.0;
        }

        // Average of all contributing features
        $average = array_sum($featureValues) / count($featureValues);

        // Boost confidence if multiple strong indicators
        $strongIndicators = count(array_filter($featureValues, static fn ($v) => $v > 0.7));
        $confidenceBoost = min(0.2, $strongIndicators * 0.1);

        return min(1.0, $average + $confidenceBoost);
    }

    /**
     * Calculate overall anomaly score
     *
     * Weighted average of feature scores with pattern-based boosting.
     * Works on the values returned by Score::getValue(), which are treated
     * here as percentages: the sum is capped at 100 and rounded to an integer.
     *
     * @param array<string, Score> $featureScores
     * @param array<array{type: string, confidence: Score}> $detectedPatterns
     */
    private function calculateOverallScore(array $featureScores, array $detectedPatterns): Score
    {
        if (empty($featureScores)) {
            return Score::zero();
        }

        // Calculate weighted feature score
        $weightedSum = 0.0;
        $totalWeight = 0.0;

        foreach ($featureScores as $featureName => $score) {
            $weight = self::FEATURE_WEIGHTS[$featureName] ?? self::DEFAULT_FEATURE_WEIGHT;
            $weightedSum += $score->getValue() * $weight;
            $totalWeight += $weight;
        }

        $baseScore = $totalWeight > 0 ? $weightedSum / $totalWeight : 0.0;

        // Pattern-based boosting
        $patternBoost = $this->calculatePatternBoost($detectedPatterns);

        // Combine base score and pattern boost (max 100%)
        $finalScore = min(100.0, $baseScore + $patternBoost);

        return new Score((int) round($finalScore));
    }

    /**
     * Calculate pattern boost to overall score
     *
     * Each detected pattern contributes according to its confidence
     * (>= 70: +10, >= 50: +5, otherwise +2); the total is capped at 30.
     *
     * @param array<array{confidence: Score}> $detectedPatterns
     */
    private function calculatePatternBoost(array $detectedPatterns): float
    {
        $boost = 0.0;

        foreach ($detectedPatterns as $pattern) {
            $confidence = $pattern['confidence']->getValue();

            $boost += match (true) {
                $confidence >= 70 => 10.0,  // High confidence pattern: +10%
                $confidence >= 50 => 5.0,   // Medium confidence: +5%
                default => 2.0,             // Low confidence: +2%
            };
        }

        // Cap pattern boost at 30%
        return min(30.0, $boost);
    }

    /**
     * Identify primary indicator (feature with highest anomaly score)
     *
     * On ties the first feature in iteration order wins. Returns 'unknown'
     * when there are no scores or none is above zero.
     *
     * @param array<string, Score> $featureScores
     */
    private function identifyPrimaryIndicator(array $featureScores): string
    {
        $maxScore = 0.0;
        $primaryIndicator = 'unknown';

        foreach ($featureScores as $featureName => $score) {
            if ($score->getValue() > $maxScore) {
                $maxScore = $score->getValue();
                $primaryIndicator = $featureName;
            }
        }

        return $primaryIndicator;
    }

    /**
     * Get detector configuration
     *
     * @return array{anomaly_threshold: mixed, z_score_threshold: float, iqr_multiplier: float}
     */
    public function getConfiguration(): array
    {
        return [
            'anomaly_threshold' => $this->anomalyThreshold->getValue(),
            'z_score_threshold' => $this->zScoreThreshold,
            'iqr_multiplier' => $this->iqrMultiplier,
        ];
    }
}