Files
michaelschiemer/src/Framework/Waf/MachineLearning/Extractors/PatternFeatureExtractor.php
Michael Schiemer 55a330b223 Enable Discovery debug logging for production troubleshooting
- Add DISCOVERY_LOG_LEVEL=debug
- Add DISCOVERY_SHOW_PROGRESS=true
- Temporary changes for debugging InitializerProcessor fixes on production
2025-08-11 20:13:26 +02:00

915 lines
27 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Framework\Waf\MachineLearning\Extractors;
use App\Framework\Waf\Analysis\ValueObjects\RequestAnalysisData;
use App\Framework\Waf\MachineLearning\BehaviorType;
use App\Framework\Waf\MachineLearning\FeatureExtractorInterface;
use App\Framework\Waf\MachineLearning\ValueObjects\BehaviorFeature;
/**
* Extracts behavioral patterns from URL paths, parameters, and request structure
*/
final class PatternFeatureExtractor implements FeatureExtractorInterface
{
/**
 * @param bool  $enabled          Value reported by isEnabled().
 * @param int   $maxPathSegments  Reported by getConfiguration() as 'max_path_segments'.
 * @param int   $maxParameterKeys Reported by getConfiguration() as 'max_parameter_keys'.
 * @param int   $historySize      Per-client entry count above which the oldest history entry is dropped.
 * @param array $pathHistory      Initial path history, keyed by client id.
 * @param array $parameterHistory Initial parameter history, keyed by client id.
 */
public function __construct(
    private readonly bool $enabled = true,
    private readonly int $maxPathSegments = 20,
    private readonly int $maxParameterKeys = 100,
    private readonly int $historySize = 100,
    private array $pathHistory = [],
    private array $parameterHistory = [],
) {
}
/**
 * Behavior type attached to the path, sequence and structure features of this extractor.
 */
public function getBehaviorType(): BehaviorType
{
    return BehaviorType::PATH_PATTERNS;
}
/**
 * Reports whether the request carries a non-empty path (in the sense of PHP's empty()).
 */
public function canExtract(RequestAnalysisData $requestData): bool
{
    if (empty($requestData->path)) {
        return false;
    }

    return true;
}
/**
 * Records the request in the client's history and returns all extracted features.
 *
 * The request is recorded first, so the history-based features already
 * include the current request. Feature groups are concatenated in the order
 * path, parameter, sequence, structure; falsy entries are removed from the
 * combined list before it is returned.
 *
 * @param array $context Not read by this method.
 */
public function extractFeatures(RequestAnalysisData $requestData, array $context = []): array
{
    $clientId = $this->getClientId($requestData);

    // History must be updated before any history-based feature is computed.
    $this->recordPatterns($clientId, $requestData);

    $collected = array_merge(
        $this->extractPathFeatures($requestData, $clientId),
        $this->extractParameterFeatures($requestData, $clientId),
        $this->extractSequenceFeatures($requestData, $clientId),
        $this->extractStructureFeatures($requestData),
    );

    return array_filter($collected);
}
/**
 * Builds the nine path features: three structural, three history-based
 * (per client) and three suspicious-content scores, in that order.
 */
private function extractPathFeatures(RequestAnalysisData $requestData, string $clientId): array
{
    $requestPath = $requestData->path;

    return [
        // Structure of the current path
        $this->extractPathDepth($requestPath),
        $this->extractPathComplexity($requestPath),
        $this->extractPathEntropy($requestPath),
        // Patterns across the client's recorded paths
        $this->extractPathUniqueness($clientId),
        $this->extractPathRepetition($clientId),
        $this->extractPathDiversity($clientId),
        // Suspicious characteristics of the current path
        $this->extractSuspiciousPathScore($requestPath),
        $this->extractFileExtensionPattern($requestPath),
        $this->extractDirectoryTraversalScore($requestPath),
    ];
}
/**
 * Builds the eight parameter features, or an empty list when the request
 * has no parameters at all.
 */
private function extractParameterFeatures(RequestAnalysisData $requestData, string $clientId): array
{
    $parameters = $requestData->getAllParameters();

    if (empty($parameters)) {
        return [];
    }

    return [
        // Count and structure of the current parameters
        $this->extractParameterCount($parameters),
        $this->extractParameterComplexity($parameters),
        $this->extractParameterEntropy($parameters),
        // Patterns across the client's recorded parameter sets
        $this->extractParameterUniqueness($clientId),
        $this->extractParameterKeyDiversity($clientId),
        $this->extractParameterValueEntropy($parameters),
        // Suspicious characteristics of the current parameters
        $this->extractSuspiciousParameterScore($parameters),
        $this->extractInjectionPatternScore($parameters),
    ];
}
/**
 * Builds the three path-sequence features once the client has at least two
 * recorded paths; returns an empty list otherwise.
 *
 * $requestData is not read here; only the stored history is used.
 */
private function extractSequenceFeatures(RequestAnalysisData $requestData, string $clientId): array
{
    $recordedPaths = $this->pathHistory[$clientId] ?? [];

    // A sequence needs at least one transition, i.e. two recorded paths.
    if (count($recordedPaths) < 2) {
        return [];
    }

    return [
        $this->extractPathSequenceEntropy($recordedPaths),
        $this->extractPathTransitionScore($recordedPaths),
        $this->extractNavigationPattern($recordedPaths),
    ];
}
/**
 * Builds the three request-structure features: overall complexity,
 * header/body size ratio and content-type consistency.
 */
private function extractStructureFeatures(RequestAnalysisData $requestData): array
{
    return [
        $this->extractRequestComplexity($requestData),
        $this->extractHeaderToBodyRatio($requestData),
        $this->extractContentTypeConsistency($requestData),
    ];
}
/**
 * Counts the non-empty segments of the path.
 *
 * Leading and trailing slashes are ignored and empty segments (from
 * repeated slashes) are skipped. A segment consisting of "0" counts as a
 * segment: the filter compares against the empty string explicitly because
 * array_filter() without a callback would also drop "0".
 */
private function extractPathDepth(string $path): BehaviorFeature
{
    $segments = array_filter(
        explode('/', trim($path, '/')),
        static fn (string $segment): bool => $segment !== '',
    );

    return BehaviorFeature::create(
        type: $this->getBehaviorType(),
        name: 'path_depth',
        value: count($segments),
        unit: 'segments'
    );
}
/**
 * Scores the path by summing, for every non-empty segment:
 *  - its byte length divided by 20,
 *  - its number of distinct bytes divided by 10,
 *  - 0.5 for every byte outside [a-zA-Z0-9_-].
 *
 * Empty segments are skipped. A "0" segment is kept: the filter compares
 * against the empty string explicitly because array_filter() without a
 * callback would also drop "0".
 */
private function extractPathComplexity(string $path): BehaviorFeature
{
    $segments = array_filter(
        explode('/', trim($path, '/')),
        static fn (string $segment): bool => $segment !== '',
    );

    $complexity = 0.0;
    foreach ($segments as $segment) {
        // Length contribution
        $complexity += strlen($segment) / 20.0;

        // Character diversity contribution
        $uniqueChars = count(array_unique(str_split($segment)));
        $complexity += $uniqueChars / 10.0;

        // Special character contribution
        $specialChars = preg_match_all('/[^a-zA-Z0-9_-]/', $segment);
        $complexity += $specialChars * 0.5;
    }

    return BehaviorFeature::create(
        type: $this->getBehaviorType(),
        name: 'path_complexity',
        value: $complexity,
        unit: 'complexity_score'
    );
}
/**
 * Builds a 'path_entropy' feature from the byte frequencies of the
 * lower-cased path. The frequency counts are handed to
 * BehaviorFeature::entropy(), which performs the actual calculation.
 */
private function extractPathEntropy(string $path): BehaviorFeature
{
    $frequencies = array_count_values(str_split(strtolower($path)));

    return BehaviorFeature::entropy(
        type: $this->getBehaviorType(),
        name: 'path_entropy',
        distribution: array_values($frequencies)
    );
}
/**
 * Ratio of distinct paths to all paths recorded for this client.
 * A client without recorded paths is reported as 1.0.
 */
private function extractPathUniqueness(string $clientId): BehaviorFeature
{
    $recorded = $this->pathHistory[$clientId] ?? [];

    $ratio = 1.0;
    if (! empty($recorded)) {
        $ratio = count(array_unique($recorded)) / count($recorded);
    }

    return BehaviorFeature::create(
        type: $this->getBehaviorType(),
        name: 'path_uniqueness',
        value: $ratio,
        unit: 'ratio'
    );
}
/**
 * Share of this client's recorded requests that went to its single most
 * frequently requested path.
 *
 * With fewer than two recorded paths the feature is reported as 0.0. Both
 * outcomes use the unit 'ratio', so the 'path_repetition' feature always
 * carries the same unit.
 */
private function extractPathRepetition(string $clientId): BehaviorFeature
{
    $pathHistory = $this->pathHistory[$clientId] ?? [];
    $totalCount = count($pathHistory);

    // Fewer than two samples: no repetition can be observed yet.
    $repetition = $totalCount < 2
        ? 0.0
        : max(array_count_values($pathHistory)) / $totalCount;

    return BehaviorFeature::create(
        type: $this->getBehaviorType(),
        name: 'path_repetition',
        value: $repetition,
        unit: 'ratio'
    );
}
/**
 * Builds a 'path_diversity' feature from how often each path occurs in the
 * client's history. The per-path counts are handed to
 * BehaviorFeature::entropy(); without history the feature is 0.0 bits.
 */
private function extractPathDiversity(string $clientId): BehaviorFeature
{
    $recorded = $this->pathHistory[$clientId] ?? [];

    if (! empty($recorded)) {
        $occurrences = array_count_values($recorded);

        return BehaviorFeature::entropy(
            type: $this->getBehaviorType(),
            name: 'path_diversity',
            distribution: array_values($occurrences)
        );
    }

    return BehaviorFeature::create(
        type: $this->getBehaviorType(),
        name: 'path_diversity',
        value: 0.0,
        unit: 'bits'
    );
}
/**
 * Adds up weighted indicators found in the path and caps the sum at 1.0:
 *  - 0.3 per admin/system substring found (case-insensitive),
 *  - 0.2 for any percent-encoded byte,
 *  - 0.4 for a double-encoded byte (%25XX),
 *  - 0.5 for an encoded null byte (%00),
 *  - 0.2 when the path is longer than 200 bytes.
 */
private function extractSuspiciousPathScore(string $path): BehaviorFeature
{
    $score = 0.0;

    // Admin/system paths; each substring that occurs adds its own 0.3.
    foreach (['/admin', '/administrator', '/config', '/debug', '/test'] as $needle) {
        if (stripos($path, $needle) !== false) {
            $score += 0.3;
        }
    }

    // Remaining indicators as [detected, weight] pairs, applied in order.
    $indicators = [
        [preg_match('/%[0-9a-fA-F]{2}/', $path) === 1, 0.2],   // encoded characters
        [preg_match('/%25[0-9a-fA-F]{2}/', $path) === 1, 0.4], // double encoding
        [str_contains($path, '%00'), 0.5],                     // null bytes
        [strlen($path) > 200, 0.2],                            // excessive length
    ];

    foreach ($indicators as [$detected, $weight]) {
        if ($detected) {
            $score += $weight;
        }
    }

    return BehaviorFeature::create(
        type: $this->getBehaviorType(),
        name: 'suspicious_path_score',
        value: min($score, 1.0),
        unit: 'score'
    );
}
/**
 * Rates the path's file extension: 1.0 for a server-side script or
 * executable extension, 0.1 for any other non-empty extension, 0.0 when
 * there is none (or it is empty in the sense of PHP's empty()).
 */
private function extractFileExtensionPattern(string $path): BehaviorFeature
{
    $suffix = strtolower(pathinfo($path, PATHINFO_EXTENSION));

    $executableSuffixes = [
        'php', 'asp', 'aspx', 'jsp', 'py', 'pl', 'cgi', 'sh', 'bat', 'exe',
    ];

    $risk = match (true) {
        in_array($suffix, $executableSuffixes, true) => 1.0,
        ! empty($suffix) => 0.1, // Any extension is slightly suspicious
        default => 0.0,
    };

    return BehaviorFeature::create(
        type: $this->getBehaviorType(),
        name: 'file_extension_risk',
        value: $risk,
        unit: 'risk_score'
    );
}
/**
 * Adds 0.3 for every occurrence of a traversal sequence (plain or
 * percent-encoded, matched case-insensitively) and caps the sum at 1.0.
 */
private function extractDirectoryTraversalScore(string $path): BehaviorFeature
{
    // The needles are already lower-case, so only the haystack is folded.
    $haystack = strtolower($path);
    $score = 0.0;

    foreach (['../', '..\\', '%2e%2e%2f', '%2e%2e%5c'] as $needle) {
        $score += substr_count($haystack, $needle) * 0.3;
    }

    return BehaviorFeature::create(
        type: $this->getBehaviorType(),
        name: 'directory_traversal_score',
        value: min($score, 1.0),
        unit: 'score'
    );
}
/**
 * Reports the number of top-level parameters of the request.
 */
private function extractParameterCount(array $parameters): BehaviorFeature
{
    $total = count($parameters);

    return BehaviorFeature::create(
        type: BehaviorType::PARAMETER_PATTERNS,
        name: 'parameter_count',
        value: $total,
        unit: 'count'
    );
}
/**
 * Scores the parameters by summing, for every entry:
 *  - key length / 50 plus 0.1 per key byte outside [a-zA-Z0-9_],
 *  - for string values: value length / 200 plus 0.05 per byte that is
 *    neither alphanumeric nor whitespace.
 *
 * Non-string values (e.g. nested arrays) contribute through their key only.
 *
 * Keys are cast to string first: PHP stores numeric-string array keys as
 * int, and under strict_types an int passed to strlen()/preg_match_all()
 * raises a TypeError (e.g. for a request such as `?1=x`).
 */
private function extractParameterComplexity(array $parameters): BehaviorFeature
{
    $complexity = 0.0;

    foreach ($parameters as $key => $value) {
        $name = (string) $key;

        // Key complexity
        $complexity += strlen($name) / 50.0;
        $complexity += preg_match_all('/[^a-zA-Z0-9_]/', $name) * 0.1;

        // Value complexity
        if (is_string($value)) {
            $complexity += strlen($value) / 200.0;
            $complexity += preg_match_all('/[^a-zA-Z0-9\\s]/', $value) * 0.05;
        }
    }

    return BehaviorFeature::create(
        type: BehaviorType::PARAMETER_PATTERNS,
        name: 'parameter_complexity',
        value: $complexity,
        unit: 'complexity_score'
    );
}
/**
 * Builds a 'parameter_entropy' feature from the byte frequencies of all
 * parameter keys concatenated and lower-cased. The counts are handed to
 * BehaviorFeature::entropy(); without parameters the feature is 0.0 bits.
 */
private function extractParameterEntropy(array $parameters): BehaviorFeature
{
    if (count($parameters) === 0) {
        return BehaviorFeature::create(
            type: BehaviorType::PARAMETER_PATTERNS,
            name: 'parameter_entropy',
            value: 0.0,
            unit: 'bits'
        );
    }

    // One string made of every key, so frequencies span all keys at once.
    $joinedKeys = strtolower(implode('', array_keys($parameters)));
    $frequencies = array_count_values(str_split($joinedKeys));

    return BehaviorFeature::entropy(
        type: BehaviorType::PARAMETER_PATTERNS,
        name: 'parameter_entropy',
        distribution: array_values($frequencies)
    );
}
/**
 * Ratio of distinct parameter sets to all parameter sets recorded for this
 * client. Sets are compared with SORT_REGULAR; a client without recorded
 * parameter sets is reported as 1.0.
 */
private function extractParameterUniqueness(string $clientId): BehaviorFeature
{
    $recorded = $this->parameterHistory[$clientId] ?? [];

    $ratio = 1.0;
    if (! empty($recorded)) {
        $distinct = array_unique($recorded, SORT_REGULAR);
        $ratio = count($distinct) / count($recorded);
    }

    return BehaviorFeature::create(
        type: BehaviorType::PARAMETER_PATTERNS,
        name: 'parameter_uniqueness',
        value: $ratio,
        unit: 'ratio'
    );
}
/**
 * Builds a 'parameter_key_diversity' feature from how often each parameter
 * key occurs across the client's recorded parameter sets. The per-key
 * counts are handed to BehaviorFeature::entropy(); without history the
 * feature is 0.0 bits.
 */
private function extractParameterKeyDiversity(string $clientId): BehaviorFeature
{
    $recorded = $this->parameterHistory[$clientId] ?? [];

    if (empty($recorded)) {
        return BehaviorFeature::create(
            type: BehaviorType::PARAMETER_PATTERNS,
            name: 'parameter_key_diversity',
            value: 0.0,
            unit: 'bits'
        );
    }

    // Flatten the keys of every recorded set into one list.
    $seenKeys = [];
    foreach ($recorded as $parameterSet) {
        if (! is_array($parameterSet)) {
            continue;
        }

        foreach (array_keys($parameterSet) as $key) {
            $seenKeys[] = $key;
        }
    }

    $occurrences = array_count_values($seenKeys);

    return BehaviorFeature::entropy(
        type: BehaviorType::PARAMETER_PATTERNS,
        name: 'parameter_key_diversity',
        distribution: array_values($occurrences)
    );
}
/**
 * Builds a 'parameter_value_entropy' feature from the byte frequencies of
 * all string parameter values concatenated and lower-cased. Non-string
 * values are ignored. When the concatenation is empty in the sense of
 * PHP's empty() (which covers "no parameters" as well), the feature is
 * 0.0 bits.
 */
private function extractParameterValueEntropy(array $parameters): BehaviorFeature
{
    $stringValues = array_filter(array_values($parameters), is_string(...));
    $joinedValues = implode('', $stringValues);

    // An empty parameter list also ends up here: it joins to ''.
    if (empty($joinedValues)) {
        return BehaviorFeature::create(
            type: BehaviorType::PARAMETER_PATTERNS,
            name: 'parameter_value_entropy',
            value: 0.0,
            unit: 'bits'
        );
    }

    $frequencies = array_count_values(str_split(strtolower($joinedValues)));

    return BehaviorFeature::entropy(
        type: BehaviorType::PARAMETER_PATTERNS,
        name: 'parameter_value_entropy',
        distribution: array_values($frequencies)
    );
}
/**
 * Adds up weighted indicators over all parameters and caps the sum at 1.0:
 *  - 0.3 for every suspicious keyword contained in a key (case-insensitive),
 *  - 0.1 for a string value containing a percent-encoded byte,
 *  - 0.2 for a string value longer than 1000 bytes.
 *
 * Keys are cast to string first: PHP stores numeric-string array keys as
 * int, and under strict_types an int passed to strtolower() raises a
 * TypeError (e.g. for a request such as `?1=x`).
 */
private function extractSuspiciousParameterScore(array $parameters): BehaviorFeature
{
    $suspiciousScore = 0.0;

    $suspiciousKeys = [
        'eval', 'exec', 'system', 'cmd', 'command', 'shell',
        'admin', 'root', 'password', 'pass', 'auth', 'token',
        'debug', 'test', 'dev', 'config', 'settings',
    ];

    foreach ($parameters as $key => $value) {
        $lowerKey = strtolower((string) $key);

        // Suspicious parameter names; every keyword that occurs counts.
        foreach ($suspiciousKeys as $suspicious) {
            if (str_contains($lowerKey, $suspicious)) {
                $suspiciousScore += 0.3;
            }
        }

        if (! is_string($value)) {
            continue;
        }

        // Encoded values
        if (preg_match('/%[0-9a-fA-F]{2}/', $value)) {
            $suspiciousScore += 0.1;
        }

        // Extremely long values
        if (strlen($value) > 1000) {
            $suspiciousScore += 0.2;
        }
    }

    return BehaviorFeature::create(
        type: BehaviorType::PARAMETER_PATTERNS,
        name: 'suspicious_parameter_score',
        value: min($suspiciousScore, 1.0),
        unit: 'score'
    );
}
/**
 * Adds 0.4 for every string parameter value that matches at least one of
 * the SQL, XSS or command injection signatures, capped at 1.0. A value is
 * counted once no matter how many signatures it matches; non-string values
 * are skipped.
 */
private function extractInjectionPatternScore(array $parameters): BehaviorFeature
{
    $signatureGroups = [
        'sql' => ['/union\\s+select/i', '/or\\s+1\\s*=\\s*1/i', '/\\s*;\\s*drop\\s+table/i'],
        'xss' => ['/<script/i', '/javascript:/i', '/onerror\\s*=/i'],
        'cmd' => ['/;\\s*(cat|ls|pwd|id)/i', '/\\|\\s*(nc|netcat)/i'],
    ];

    // The group a signature belongs to does not affect the score.
    $signatures = array_merge(...array_values($signatureGroups));

    $score = 0.0;
    foreach ($parameters as $value) {
        if (! is_string($value)) {
            continue;
        }

        foreach ($signatures as $signature) {
            if (preg_match($signature, $value)) {
                $score += 0.4;

                break; // One hit per value is enough
            }
        }
    }

    return BehaviorFeature::create(
        type: BehaviorType::PARAMETER_PATTERNS,
        name: 'injection_pattern_score',
        value: min($score, 1.0),
        unit: 'score'
    );
}
/**
 * Builds a 'path_sequence_entropy' feature from how often each pair of
 * consecutive paths ("previous -> next") occurs in the history. The
 * per-pair counts are handed to BehaviorFeature::entropy().
 */
private function extractPathSequenceEntropy(array $pathHistory): BehaviorFeature
{
    $length = count($pathHistory);

    // Pair every entry with its predecessor.
    $transitions = [];
    for ($position = 1; $position < $length; $position++) {
        $transitions[] = $pathHistory[$position - 1] . ' -> ' . $pathHistory[$position];
    }

    $occurrences = array_count_values($transitions);

    return BehaviorFeature::entropy(
        type: $this->getBehaviorType(),
        name: 'path_sequence_entropy',
        distribution: array_values($occurrences)
    );
}
/**
 * Average similarity between consecutive paths in the history.
 *
 * Similarity of a pair is 1 - levenshtein(a, b) / max(len(a), len(b)).
 * Two empty paths are treated as identical (similarity 1.0) instead of
 * dividing by zero. With fewer than two paths the feature is 0.0. Both
 * outcomes use the unit 'similarity_score'.
 *
 * NOTE(review): levenshtein() is called on full, request-supplied paths for
 * every consecutive pair; consider bounding path length upstream if long
 * paths make this too expensive.
 */
private function extractPathTransitionScore(array $pathHistory): BehaviorFeature
{
    $transitionCount = count($pathHistory) - 1;

    if ($transitionCount < 1) {
        return BehaviorFeature::create(
            type: $this->getBehaviorType(),
            name: 'path_transition_score',
            value: 0.0,
            unit: 'similarity_score'
        );
    }

    $transitionScore = 0.0;
    for ($i = 0; $i < $transitionCount; $i++) {
        $current = $pathHistory[$i];
        $next = $pathHistory[$i + 1];

        $longest = max(strlen($current), strlen($next));

        // Both paths empty: identical, and the ratio would be 0 / 0.
        $transitionScore += $longest === 0
            ? 1.0
            : 1.0 - (levenshtein($current, $next) / $longest);
    }

    return BehaviorFeature::create(
        type: $this->getBehaviorType(),
        name: 'path_transition_score',
        value: $transitionScore / $transitionCount,
        unit: 'similarity_score'
    );
}
/**
 * Scores how often the client returns to a path it requested shortly before.
 *
 * Starting with the third history entry, each entry is compared with up to
 * five preceding entries, scanning from the oldest of them forward. The
 * first equal entry found adds 1 / distance to the score and ends the scan
 * for that entry. The sum is divided by the number of entries examined
 * (history length minus two); shorter histories yield 0.0.
 *
 * NOTE(review): because the scan starts at the oldest entry in the window,
 * a path that occurs several times in the window is scored by its oldest
 * occurrence (the smallest contribution) — confirm this is intended.
 */
private function extractNavigationPattern(array $pathHistory): BehaviorFeature
{
    $length = count($pathHistory);
    $score = 0.0;

    for ($position = 2; $position < $length; $position++) {
        $windowStart = max(0, $position - 5);

        for ($earlier = $windowStart; $earlier < $position; $earlier++) {
            if ($pathHistory[$earlier] !== $pathHistory[$position]) {
                continue;
            }

            // Shorter distance to the matched entry yields a larger contribution.
            $score += 1.0 / ($position - $earlier);

            break;
        }
    }

    $normalized = $length > 2 ? $score / ($length - 2) : 0.0;

    return BehaviorFeature::create(
        type: $this->getBehaviorType(),
        name: 'navigation_backtracking',
        value: $normalized,
        unit: 'backtracking_score'
    );
}
/**
 * Combines four size measures of the request into one score:
 * path length / 100 + parameter count / 20 + header count / 30
 * + body length / 5000.
 */
private function extractRequestComplexity(RequestAnalysisData $requestData): BehaviorFeature
{
    $pathPart = strlen($requestData->path) / 100.0;
    $parameterPart = count($requestData->getAllParameters()) / 20.0;
    $headerPart = count($requestData->headers) / 30.0;
    $bodyPart = strlen($requestData->body) / 5000.0;

    return BehaviorFeature::create(
        type: $this->getBehaviorType(),
        name: 'request_complexity',
        value: 0.0 + $pathPart + $parameterPart + $headerPart + $bodyPart,
        unit: 'complexity_score'
    );
}
/**
 * Share of the combined header and body size that is taken up by headers.
 *
 * Header size is the sum of the byte lengths of every header name and
 * value; when both header size and body size are zero the ratio is 0.0.
 */
private function extractHeaderToBodyRatio(RequestAnalysisData $requestData): BehaviorFeature
{
    $headerBytes = 0;
    foreach ($requestData->headers as $name => $value) {
        $headerBytes += strlen($name) + strlen($value);
    }

    $totalBytes = $headerBytes + strlen($requestData->body);
    $share = $totalBytes > 0 ? $headerBytes / $totalBytes : 0.0;

    return BehaviorFeature::create(
        type: $this->getBehaviorType(),
        name: 'header_body_ratio',
        value: $share,
        unit: 'ratio'
    );
}
/**
 * Checks whether a non-empty body can be parsed as what the request claims
 * it is. Starts at 1.0 and subtracts 0.5 when the request is flagged as
 * JSON but the body does not decode, and 0.5 when it is flagged as XML but
 * the body does not parse. Requests without a content type keep 1.0.
 *
 * XML handling:
 *  - libxml's error buffer is cleared before parsing so errors left over
 *    from earlier, unrelated parsing are not counted against this body;
 *  - a failed parse (false return) counts as inconsistent even if the
 *    error buffer is empty;
 *  - LIBXML_NONET keeps the parser from fetching network resources while
 *    handling an untrusted body;
 *  - the previous libxml error-handling mode is restored in all cases.
 */
private function extractContentTypeConsistency(RequestAnalysisData $requestData): BehaviorFeature
{
    $consistencyScore = 1.0;

    if ($requestData->contentType !== null) {
        if ($requestData->isJson() && ! empty($requestData->body)) {
            json_decode($requestData->body);
            if (json_last_error() !== JSON_ERROR_NONE) {
                $consistencyScore -= 0.5;
            }
        }

        if ($requestData->isXml() && ! empty($requestData->body)) {
            $previousSetting = libxml_use_internal_errors(true);
            libxml_clear_errors(); // Start from an empty error buffer

            try {
                $document = simplexml_load_string($requestData->body, options: LIBXML_NONET);
                $xmlIsInvalid = $document === false || libxml_get_errors() !== [];
            } finally {
                libxml_clear_errors();
                libxml_use_internal_errors($previousSetting);
            }

            if ($xmlIsInvalid) {
                $consistencyScore -= 0.5;
            }
        }
    }

    return BehaviorFeature::create(
        type: $this->getBehaviorType(),
        name: 'content_type_consistency',
        value: max(0.0, $consistencyScore),
        unit: 'consistency_score'
    );
}
/**
 * Appends the request's path, and its parameters when there are any, to the
 * client's histories.
 *
 * After each append the oldest entry is dropped if the history holds more
 * than $historySize entries. The client's parameter history entry is
 * created even when the request has no parameters.
 */
private function recordPatterns(string $clientId, RequestAnalysisData $requestData): void
{
    // Path history
    $this->pathHistory[$clientId] ??= [];
    $this->pathHistory[$clientId][] = $requestData->path;

    if (count($this->pathHistory[$clientId]) > $this->historySize) {
        array_shift($this->pathHistory[$clientId]);
    }

    // Parameter history
    $this->parameterHistory[$clientId] ??= [];

    $parameters = $requestData->getAllParameters();
    if (empty($parameters)) {
        return;
    }

    $this->parameterHistory[$clientId][] = $parameters;

    if (count($this->parameterHistory[$clientId]) > $this->historySize) {
        array_shift($this->parameterHistory[$clientId]);
    }
}
/**
 * Derives the key under which a client's history is stored: the session id
 * when one is present, otherwise the client IP, otherwise 'unknown'.
 */
private function getClientId(RequestAnalysisData $requestData): string
{
    return match (true) {
        ! empty($requestData->sessionId) => 'session:' . $requestData->sessionId,
        $requestData->clientIp !== null => 'ip:' . $requestData->clientIp->toString(),
        default => 'unknown',
    };
}
/**
 * Names of every feature this extractor can emit, grouped as path,
 * parameter, sequence and structure features (in that order).
 */
public function getFeatureNames(): array
{
    $pathFeatures = [
        'path_depth', 'path_complexity', 'path_entropy', 'path_uniqueness',
        'path_repetition', 'path_diversity', 'suspicious_path_score',
        'file_extension_risk', 'directory_traversal_score',
    ];

    $parameterFeatures = [
        'parameter_count', 'parameter_complexity', 'parameter_entropy',
        'parameter_uniqueness', 'parameter_key_diversity', 'parameter_value_entropy',
        'suspicious_parameter_score', 'injection_pattern_score',
    ];

    $sequenceFeatures = [
        'path_sequence_entropy', 'path_transition_score', 'navigation_backtracking',
    ];

    $structureFeatures = [
        'request_complexity', 'header_body_ratio', 'content_type_consistency',
    ];

    return array_merge($pathFeatures, $parameterFeatures, $sequenceFeatures, $structureFeatures);
}
/**
 * Returns the constructor settings plus the number of feature names as an
 * associative array.
 */
public function getConfiguration(): array
{
    $featureCount = count($this->getFeatureNames());

    return [
        'enabled' => $this->enabled,
        'max_path_segments' => $this->maxPathSegments,
        'max_parameter_keys' => $this->maxParameterKeys,
        'history_size' => $this->historySize,
        'feature_count' => $featureCount,
    ];
}
/**
 * Whether the extractor was constructed as enabled.
 */
public function isEnabled(): bool
{
    return $this->enabled;
}

/**
 * Fixed priority of this extractor (medium-high).
 */
public function getPriority(): int
{
    return 80;
}

/**
 * Fixed processing time estimate, in milliseconds.
 */
public function getExpectedProcessingTime(): int
{
    return 75;
}

/**
 * Always false: the per-client pattern history needs sequential access.
 */
public function supportsParallelExecution(): bool
{
    return false;
}

/**
 * This extractor declares no dependencies.
 */
public function getDependencies(): array
{
    return [];
}
}