michaelschiemer/src/Framework/View/Lexer/HtmlLexer.php

<?php

declare(strict_types=1);

namespace App\Framework\View\Lexer;

/**
 * Byte-oriented lexer that splits an HTML string into a flat list of tokens.
 *
 * The lexer is lossless: concatenating the content of every token returned by
 * tokenize() reproduces the input string. All offsets are byte offsets, which
 * is safe for UTF-8 because every delimiter the lexer looks for is ASCII.
 */
final class HtmlLexer
{
    private const COMMENT_START = '!--';
    private const COMMENT_END = '-->';
    private const DOCTYPE_START = '!DOCTYPE';
    private const CDATA_START = '![CDATA[';
    private const CDATA_END = ']]>';

    /**
     * Captures the tag name of an opening tag using the same delimiter rule as
     * consumeOpeningTag(): the name runs until whitespace, '/' or '>'.
     */
    private const TAG_NAME_PATTERN = '/^<([a-z][^\s\/>]*)/i';

    /** @var array<string> Lower-case names of tags whose body is raw text (no nested markup) */
    private const RAW_TEXT_TAGS = ['script', 'style'];

    private string $html;
    private int $position;
    private int $length;

    /** Lower-case name of the raw text tag the cursor is currently inside, or null */
    private ?string $currentTagName = null;

    /**
     * @param string $html Markup to tokenize
     */
    public function __construct(string $html)
    {
        $this->html = $html;
        $this->position = 0;
        $this->length = strlen($html);
    }

    /**
     * Tokenize HTML and return array of tokens
     *
     * Every call starts from the beginning of the input with a clean state, so
     * the method can be invoked repeatedly on the same instance.
     *
     * @return array<Token>
     */
    public function tokenize(): array
    {
        $tokens = [];
        $this->position = 0;
        // A previous run may have ended inside an unclosed <script>/<style>;
        // without this reset the next run would begin in raw text mode.
        $this->currentTagName = null;

        while ($this->position < $this->length) {
            if ($this->atTagStart()) {
                $tokens[] = $this->consumeTag();

                continue;
            }

            // Plain content, or the raw body of a script/style element
            $content = $this->consumeUntil('<');
            if ($content !== '') {
                $tokens[] = new Token($content, TokenType::CONTENT);
            }
        }

        return $tokens;
    }

    /**
     * Normalize self-closing x-components to explicit closing tags
     *
     * Converts e.g. `<x-foo bar="1" />` into `<x-foo bar="1"></x-foo>`. All
     * other tokens are emitted unchanged.
     *
     * @return string The input markup with self-closing x-components expanded
     */
    public function normalizeXComponents(): string
    {
        $parts = [];

        foreach ($this->tokenize() as $token) {
            if ($token->type !== TokenType::SELF_CLOSING_TAG || !$this->isXComponent($token->content)) {
                $parts[] = $token->content;

                continue;
            }

            // Remove exactly one self-closing marker. rtrim() with a character
            // mask would also eat '/' or '>' characters that belong to the
            // last attribute value (e.g. href=/path//>).
            $tagContent = rtrim($token->content);
            if (substr($tagContent, -2) === '/>') {
                $tagContent = substr($tagContent, 0, -2);
            } elseif (substr($tagContent, -1) === '/') {
                // Tag was cut off by the end of input right after the slash
                $tagContent = substr($tagContent, 0, -1);
            }
            $tagContent = rtrim($tagContent);

            // Extract tag name (e.g., "x-counter" from "<x-counter" or "<x-counter attr='val'")
            preg_match(self::TAG_NAME_PATTERN, $tagContent, $matches);
            $tagName = $matches[1] ?? '';

            // Fallback: keep original if we can't parse
            $parts[] = $tagName !== ''
                ? $tagContent . '></' . $tagName . '>'
                : $token->content;
        }

        return implode('', $parts);
    }

    /**
     * Whether the cursor is on a '<' that has to be lexed as markup.
     *
     * Inside a raw text element only the matching end tag counts as markup;
     * any other '<' is part of the element's text.
     */
    private function atTagStart(): bool
    {
        if ($this->current() !== '<') {
            return false;
        }

        return $this->currentTagName === null || $this->atRawTextClosingTag();
    }

    /**
     * Whether the cursor is on the end tag of the current raw text element.
     *
     * The comparison is case-insensitive because the tracked name is stored in
     * lower case while the markup may spell the tag in any case.
     */
    private function atRawTextClosingTag(): bool
    {
        if ($this->currentTagName === null) {
            return false;
        }

        $closingTag = '</' . $this->currentTagName;

        return strcasecmp($this->peek(strlen($closingTag)), $closingTag) === 0;
    }

    /**
     * Consume one markup construct starting at the current '<' and dispatch to
     * the specialised consumer for comments, CDATA, doctype, closing tags or
     * opening tags.
     */
    private function consumeTag(): Token
    {
        $start = $this->position;

        // Consume '<'
        $this->advance();

        if ($this->peek(strlen(self::COMMENT_START)) === self::COMMENT_START) {
            return $this->consumeComment($start);
        }

        if ($this->peek(strlen(self::CDATA_START)) === self::CDATA_START) {
            return $this->consumeCData($start);
        }

        // The doctype keyword is case-insensitive in HTML
        if (strcasecmp($this->peek(strlen(self::DOCTYPE_START)), self::DOCTYPE_START) === 0) {
            return $this->consumeDoctype($start);
        }

        if ($this->current() === '/') {
            return $this->consumeClosingTag($start);
        }

        return $this->consumeOpeningTag($start);
    }

    /**
     * Consume a CDATA section up to and including ']]>' (or to end of input).
     *
     * @param int $start Byte offset of the section's leading '<'
     */
    private function consumeCData(int $start): Token
    {
        $this->consumeThrough(self::CDATA_END);

        return new Token($this->slice($start), TokenType::CDATA);
    }

    /**
     * Consume an opening or self-closing tag including its attributes.
     *
     * A '>' inside a quoted attribute value does not terminate the tag. When
     * the tag is a non-self-closing script/style tag, raw text mode is entered
     * for the content that follows.
     *
     * @param int $start Byte offset of the tag's leading '<'
     */
    private function consumeOpeningTag(int $start): Token
    {
        $tagNameStart = $this->position;

        // Consume tag name
        while ($this->position < $this->length
            && !ctype_space($this->current())
            && $this->current() !== '>'
            && $this->current() !== '/') {
            $this->advance();
        }

        // Extract tag name for raw text handling
        $tagName = strtolower(substr($this->html, $tagNameStart, $this->position - $tagNameStart));

        // Consume attributes; $quoteChar is the open quote character, or null
        // while the cursor is outside of a quoted value.
        $quoteChar = null;

        while ($this->position < $this->length) {
            $char = $this->current();

            if ($quoteChar !== null) {
                if ($char === $quoteChar) {
                    $quoteChar = null;
                }
            } elseif ($char === '>') {
                // Only an unquoted '>' ends the tag
                break;
            } elseif ($char === '"' || $char === "'") {
                $quoteChar = $char;
            }

            $this->advance();
        }

        $isSelfClosing = $this->html[$this->position - 1] === '/';

        // Consume '>' when present; at end of input there is nothing left to
        // consume and the cursor must not run past the end of the string.
        if ($this->current() === '>') {
            $this->advance();
        }

        if ($isSelfClosing) {
            // Self-closing tag detected: <tag />
            return new Token($this->slice($start), TokenType::SELF_CLOSING_TAG);
        }

        // Track if we entered a raw text tag
        if (in_array($tagName, self::RAW_TEXT_TAGS, true)) {
            $this->currentTagName = $tagName;
        }

        return new Token($this->slice($start), TokenType::OPEN_TAG_START);
    }

    /**
     * Consume a closing tag and leave raw text mode if it closes the element
     * that started it.
     *
     * @param int $start Byte offset of the tag's leading '<'
     */
    private function consumeClosingTag(int $start): Token
    {
        // Consume '/'
        $this->advance();

        $tagNameStart = $this->position;

        // Consume tag name
        while ($this->position < $this->length
            && !ctype_space($this->current())
            && $this->current() !== '>') {
            $this->advance();
        }

        $tagName = strtolower(substr($this->html, $tagNameStart, $this->position - $tagNameStart));

        $this->consumeThrough('>');

        // Reset current tag if we're closing a raw text tag
        if ($this->currentTagName === $tagName) {
            $this->currentTagName = null;
        }

        return new Token($this->slice($start), TokenType::CLOSING_TAG);
    }

    /**
     * Consume a comment up to and including '-->' (or to end of input).
     *
     * @param int $start Byte offset of the comment's leading '<'
     */
    private function consumeComment(int $start): Token
    {
        $this->consumeThrough(self::COMMENT_END);

        return new Token($this->slice($start), TokenType::COMMENT);
    }

    /**
     * Consume a doctype declaration up to and including '>' (or to end of input).
     *
     * @param int $start Byte offset of the declaration's leading '<'
     */
    private function consumeDoctype(int $start): Token
    {
        $this->consumeThrough('>');

        return new Token($this->slice($start), TokenType::DOCTYPE);
    }

    /**
     * Consume text content starting at the cursor and return it.
     *
     * Outside of raw text mode the content ends at the next occurrence of
     * $char. Inside a script/style element it ends at the element's end tag
     * (matched case-insensitively). In both cases the content runs to the end
     * of the input when no terminator is found.
     *
     * @param string $char Single-byte delimiter that ends regular content
     */
    private function consumeUntil(string $char): string
    {
        $start = $this->position;

        if ($start >= $this->length) {
            return '';
        }

        $end = $this->currentTagName !== null
            ? stripos($this->html, '</' . $this->currentTagName, $start)
            : strpos($this->html, $char, $start);

        $this->position = $end === false ? $this->length : $end;

        return $this->slice($start);
    }

    /**
     * Move the cursor just past the next occurrence of $terminator, or to the
     * end of the input when the terminator does not occur.
     */
    private function consumeThrough(string $terminator): void
    {
        $end = strpos($this->html, $terminator, $this->position);

        $this->position = $end === false
            ? $this->length
            : $end + strlen($terminator);
    }

    /**
     * Return the input between $start and the current cursor position.
     */
    private function slice(int $start): string
    {
        return substr($this->html, $start, $this->position - $start);
    }

    /**
     * Whether the given tag markup starts with "<x-" followed by a name.
     */
    private function isXComponent(string $tagContent): bool
    {
        // Check if tag starts with <x- (case-insensitive)
        return preg_match('/^<x-[a-z0-9][a-z0-9-]*/i', $tagContent) === 1;
    }

    /**
     * Byte under the cursor, or an empty string at end of input.
     */
    private function current(): string
    {
        if ($this->position >= $this->length) {
            return '';
        }

        return $this->html[$this->position];
    }

    /**
     * The next $length bytes starting at the cursor, or an empty string when
     * fewer than $length bytes remain.
     */
    private function peek(int $length): string
    {
        if ($this->position + $length > $this->length) {
            return '';
        }

        return substr($this->html, $this->position, $length);
    }

    /**
     * Move the cursor forward by one byte.
     */
    private function advance(): void
    {
        $this->position++;
    }
}