michaelschiemer/src/Framework/Tokenizer/ValueObjects/TokenCollection.php

<?php

declare(strict_types=1);

namespace App\Framework\Tokenizer\ValueObjects;

use ArrayIterator;
use Countable;
use IteratorAggregate;
use Traversable;

/**
 * Immutable collection of tokens with lookup, filtering, grouping and
 * transformation helpers.
 *
 * Every method that returns a collection returns a new instance; the
 * underlying array is never modified after construction.
 *
 * @implements IteratorAggregate<array-key, Token>
 */
final readonly class TokenCollection implements IteratorAggregate, Countable
{
    /**
     * @param array<Token> $tokens Tokens to wrap. The array is stored as given
     *                             (keys are not normalised).
     */
    public function __construct(
        private array $tokens = []
    ) {
    }

    /**
     * Iterate over the tokens in their stored order.
     *
     * @return Traversable<array-key, Token>
     */
    public function getIterator(): Traversable
    {
        return new ArrayIterator($this->tokens);
    }

    /**
     * Number of tokens in the collection.
     */
    public function count(): int
    {
        return count($this->tokens);
    }

    /**
     * Get all tokens as a plain array (keys as stored).
     *
     * @return array<Token>
     */
    public function toArray(): array
    {
        return $this->tokens;
    }

    /**
     * Get the token stored under the given array key.
     *
     * @return Token|null The token, or null when the key does not exist
     */
    public function get(int $index): ?Token
    {
        return $this->tokens[$index] ?? null;
    }

    /**
     * Keep only the tokens for which the predicate returns a truthy value.
     *
     * The resulting collection is re-indexed from 0.
     *
     * @param callable(Token): mixed $predicate
     */
    public function filter(callable $predicate): self
    {
        return new self(array_values(array_filter($this->tokens, $predicate)));
    }

    /**
     * Keep only tokens whose type is one of the given types.
     */
    public function filterByType(TokenType ...$types): self
    {
        return $this->filter(static fn (Token $token): bool => in_array($token->type, $types, true));
    }

    /**
     * Keep only tokens whose id is one of the given PHP token IDs.
     */
    public function filterById(int ...$ids): self
    {
        return $this->filter(static fn (Token $token): bool => in_array($token->id, $ids, true));
    }

    /**
     * Keep only tokens that report themselves as structural.
     */
    public function getStructural(): self
    {
        return $this->filter(static fn (Token $token) => $token->isStructural());
    }

    /**
     * Keep only tokens that report themselves as identifiers.
     */
    public function getIdentifiers(): self
    {
        return $this->filter(static fn (Token $token) => $token->isIdentifier());
    }

    /**
     * Keep only tokens whose line lies within [$startLine, $endLine] (both inclusive).
     */
    public function getInLineRange(int $startLine, int $endLine): self
    {
        return $this->filter(
            static fn (Token $token): bool => $token->line >= $startLine && $token->line <= $endLine
        );
    }

    /**
     * Keep only tokens whose line equals the given line.
     */
    public function getAtLine(int $line): self
    {
        return $this->filter(static fn (Token $token): bool => $token->line === $line);
    }

    /**
     * Find the first token (in stored order) for which the predicate is truthy.
     *
     * @param callable(Token): mixed $predicate
     * @return Token|null The matching token, or null when none matches
     */
    public function findFirst(callable $predicate): ?Token
    {
        foreach ($this->tokens as $token) {
            if ($predicate($token)) {
                return $token;
            }
        }

        return null;
    }

    /**
     * Find the first token of the given type.
     */
    public function findFirstOfType(TokenType $type): ?Token
    {
        return $this->findFirst(static fn (Token $token): bool => $token->type === $type);
    }

    /**
     * Apply a mapper to every token and return the results as an array.
     *
     * Array keys of the underlying token array are preserved.
     *
     * @param callable(Token): mixed $mapper
     * @return array<mixed>
     */
    public function map(callable $mapper): array
    {
        return array_map($mapper, $this->tokens);
    }

    /**
     * Extract the `value` of every token.
     *
     * @return array<mixed>
     */
    public function getValues(): array
    {
        return $this->map(static fn (Token $token) => $token->value);
    }

    /**
     * Extract the result of `getCleanValue()` for every token.
     *
     * @return array<mixed>
     */
    public function getCleanValues(): array
    {
        return $this->map(static fn (Token $token) => $token->getCleanValue());
    }

    /**
     * Group tokens by the backing value of their type, preserving order
     * within each group.
     *
     * @return array<string, array<Token>>
     */
    public function groupByType(): array
    {
        $groups = [];
        foreach ($this->tokens as $token) {
            // Appending with [] creates the group on first use.
            $groups[$token->type->value][] = $token;
        }

        return $groups;
    }

    /**
     * Group tokens by line number, preserving order within each line.
     *
     * Whitespace tokens containing "\n" are first split into per-line
     * fragments so that every fragment is filed under its own line.
     *
     * @return array<int, array<Token>>
     */
    public function groupByLine(): array
    {
        $groups = [];
        foreach ($this->tokens as $token) {
            $spansLines = $token->type === TokenType::WHITESPACE
                && str_contains($token->value, "\n");

            $fragments = $spansLines
                ? $this->splitWhitespaceAtNewlines($token)
                : [$token];

            foreach ($fragments as $fragment) {
                $groups[$fragment->line][] = $fragment;
            }
        }

        return $groups;
    }

    /**
     * Split a whitespace token at line-break boundaries.
     *
     * The value is cut at every "\r\n", "\n" or "\r". Each line break becomes
     * its own fragment assigned to the line it terminates; whitespace that
     * follows a line break is assigned to the next line.
     *
     * The token's `line` is treated as the line on which the token ENDS, so
     * the first fragment's line is `line` minus the number of "\n" in the value.
     * NOTE(review): confirm against the tokenizer that `line` really is the end
     * line; an earlier version of this comment described it as the start line.
     *
     * Example: value "\n    " with `line` = 14 yields
     * - "\n"   on line 13 (ends that line)
     * - "    " on line 14 (indentation of the next line)
     *
     * NOTE(review): a lone "\r" advances the line counter below but is not
     * counted in the start-line offset (only "\n" is); verify which rule the
     * tokenizer uses if classic-Mac line endings can occur.
     *
     * All fragments reuse the original token's position, id and context.
     *
     * @return Token[]
     */
    private function splitWhitespaceAtNewlines(Token $token): array
    {
        $parts = preg_split(
            '/(\r\n|\n|\r)/',
            $token->value,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        // preg_split() returns false on a PCRE failure; fall back to the
        // unsplit token instead of failing on the loop below.
        if ($parts === false) {
            return [$token];
        }

        $currentLine = $token->line - substr_count($token->value, "\n");
        $fragments = [];

        foreach ($parts as $part) {
            $fragments[] = new Token(
                type: TokenType::WHITESPACE,
                value: $part,
                line: $currentLine,
                position: $token->position,
                id: $token->id,
                context: $token->context,
            );

            // A line break belongs to the line it ends; everything after it
            // belongs to the following line.
            if ($part === "\r\n" || $part === "\n" || $part === "\r") {
                $currentLine++;
            }
        }

        return $fragments;
    }

    /**
     * Whether the collection contains no tokens.
     */
    public function isEmpty(): bool
    {
        return $this->tokens === [];
    }

    /**
     * Get the first token in stored order, regardless of array keys.
     *
     * @return Token|null Null when the collection is empty
     */
    public function first(): ?Token
    {
        $firstKey = array_key_first($this->tokens);

        return $firstKey === null ? null : $this->tokens[$firstKey];
    }

    /**
     * Get the last token in stored order, regardless of array keys.
     *
     * @return Token|null Null when the collection is empty
     */
    public function last(): ?Token
    {
        $lastKey = array_key_last($this->tokens);

        return $lastKey === null ? null : $this->tokens[$lastKey];
    }

    /**
     * Return a sub-range of the collection.
     *
     * Offset and length follow array_slice() semantics; integer keys are
     * re-indexed in the result.
     */
    public function slice(int $offset, ?int $length = null): self
    {
        return new self(array_slice($this->tokens, $offset, $length));
    }

    /**
     * Return a new collection with the other collection's tokens appended.
     */
    public function merge(self $other): self
    {
        return new self(array_merge($this->tokens, $other->tokens));
    }

    /**
     * Concatenate all token values into a single string.
     */
    public function toString(): string
    {
        return implode('', $this->getValues());
    }
}