';
    // NOTE(review): the `';` above closes a string constant whose beginning
    // lies outside this chunk (the code below references self::COMMENT_START
    // and self::COMMENT_END, which are not declared in the visible text).
    // It is kept verbatim — confirm against the full file.

    private const DOCTYPE_UPPER = '!DOCTYPE';
    private const DOCTYPE_LOWER = '!doctype';
    private const CDATA_START = '![CDATA[';
    private const CDATA_END = ']]>';
    private const TAG_NAME_PATTERN = '/<([a-z][a-z0-9-]*)/i';

    /** @var list<string> Raw content tags (script, style, etc.) */
    private const RAW_TEXT_TAGS = ['script', 'style'];

    /** The markup being tokenized. */
    private string $html;

    /** Byte offset of the cursor inside $html. */
    private int $position;

    /** Byte length of $html. */
    private int $length;

    /** Lowercased name of the raw-text tag currently open, or null outside of one. */
    private ?string $currentTagName = null;

    /**
     * @param string $html Markup to tokenize.
     */
    public function __construct(string $html)
    {
        $this->html = $html;
        $this->position = 0;
        // All offsets in this class are byte offsets (substr / string indexing),
        // so the byte length is the correct bound.
        $this->length = strlen($html);
    }

    /**
     * Tokenize HTML and return array of tokens.
     *
     * Every call restarts from the beginning of the input with a clean
     * raw-text state, so the method can be called repeatedly.
     *
     * @return list<Token>
     */
    public function tokenize(): array
    {
        $tokens = [];
        $this->position = 0;
        // A previous run over input with an unterminated <script>/<style>
        // would otherwise leave the tokenizer stuck in raw-text mode.
        $this->currentTagName = null;

        while ($this->position < $this->length) {
            // Inside a raw-text element a '<' only counts as markup when it
            // begins the matching closing tag; anything else is content.
            $startsTag = $this->current() === '<'
                && ($this->currentTagName === null || $this->atRawTextClosingTag());

            if ($startsTag) {
                $tokens[] = $this->consumeTag();
                continue;
            }

            $content = $this->consumeUntil('<');
            if ($content !== '') {
                $tokens[] = new Token($content, TokenType::CONTENT);
            }
        }

        return $tokens;
    }

    /**
     * Normalize self-closing x-components to explicit closing tags.
     *
     * Self-closing tokens accepted by isXComponent() are rewritten as an
     * opening tag immediately followed by a closing tag; every other token is
     * emitted unchanged.
     *
     * NOTE(review): the middle of this method was lost in the visible source.
     * The tag-name extraction below is reconstructed from the surviving
     * comments, TAG_NAME_PATTERN and the fallback branch — confirm against
     * the original file.
     *
     * @return string The input markup with x-components expanded.
     */
    public function normalizeXComponents(): string
    {
        $parts = [];

        foreach ($this->tokenize() as $token) {
            $isSelfClosingComponent = $token->type === TokenType::SELF_CLOSING_TAG
                && $this->isXComponent($token->content);

            if (!$isSelfClosingComponent) {
                $parts[] = $token->content;
                continue;
            }

            // Drop the trailing "/>" (and surrounding whitespace) so only the
            // opening part of the tag, including its attributes, remains.
            $tagContent = trim($token->content);
            $tagContent = rtrim($tagContent, '/>');
            $tagContent = trim($tagContent);

            if (preg_match(self::TAG_NAME_PATTERN, $tagContent, $matches) === 1) {
                $parts[] = $tagContent . '></' . $matches[1] . '>';
            } else {
                // Fallback: keep original if we can't parse
                $parts[] = $token->content;
            }
        }

        return implode('', $parts);
    }

    /**
     * Consume one markup construct starting at the '<' under the cursor and
     * dispatch to the specialised consumer (comment, CDATA, doctype, closing
     * tag or opening tag).
     */
    private function consumeTag(): Token
    {
        $start = $this->position;

        // Consume '<'
        $this->advance();

        if ($this->peek(strlen(self::COMMENT_START)) === self::COMMENT_START) {
            return $this->consumeComment($start);
        }

        if ($this->peek(strlen(self::CDATA_START)) === self::CDATA_START) {
            return $this->consumeCData($start);
        }

        // The doctype keyword is matched case-insensitively so mixed-case
        // spellings such as "<!DocType html>" are recognised as well.
        if (strcasecmp($this->peek(strlen(self::DOCTYPE_UPPER)), self::DOCTYPE_UPPER) === 0) {
            return $this->consumeDoctype($start);
        }

        if ($this->current() === '/') {
            return $this->consumeClosingTag($start);
        }

        return $this->consumeOpeningTag($start);
    }

    /**
     * Consume a CDATA section up to and including ']]>' (or to the end of the
     * input when the terminator is missing).
     */
    private function consumeCData(int $start): Token
    {
        $this->consumeThrough(self::CDATA_END);

        return new Token(substr($this->html, $start, $this->position - $start), TokenType::CDATA);
    }

    /**
     * Consume an opening or self-closing tag, including its attributes.
     *
     * A '>' inside a quoted attribute value does not terminate the tag. When
     * the tag is a non-self-closing raw-text element (see RAW_TEXT_TAGS) the
     * tokenizer switches to raw-text mode.
     */
    private function consumeOpeningTag(int $start): Token
    {
        $tagNameStart = $this->position;

        // Consume tag name
        while ($this->position < $this->length
            && !ctype_space($this->current())
            && $this->current() !== '>'
            && $this->current() !== '/'
        ) {
            $this->advance();
        }

        $tagName = strtolower(substr($this->html, $tagNameStart, $this->position - $tagNameStart));

        // Consume attributes. $quoteChar holds the delimiter of the quoted
        // value the cursor is inside, or null when outside of quotes.
        $quoteChar = null;
        while ($this->position < $this->length) {
            $char = $this->current();

            if ($quoteChar === null) {
                if ($char === '>') {
                    break;
                }
                if ($char === '"' || $char === "'") {
                    $quoteChar = $char;
                }
            } elseif ($char === $quoteChar) {
                $quoteChar = null;
            }

            $this->advance();
        }

        // A '/' directly before the terminator marks a self-closing tag.
        $selfClosing = $this->position > 0 && $this->html[$this->position - 1] === '/';

        // Consume '>' only if it is really there, so truncated input never
        // moves the cursor past the end of the buffer.
        if ($this->current() === '>') {
            $this->advance();
        }

        $raw = substr($this->html, $start, $this->position - $start);

        if ($selfClosing) {
            return new Token($raw, TokenType::SELF_CLOSING_TAG);
        }

        // Track if we entered a raw text tag
        if (in_array($tagName, self::RAW_TEXT_TAGS, true)) {
            $this->currentTagName = $tagName;
        }

        return new Token($raw, TokenType::OPEN_TAG_START);
    }

    /**
     * Consume a closing tag and leave raw-text mode when it closes the
     * raw-text element that is currently open.
     */
    private function consumeClosingTag(int $start): Token
    {
        // Consume '/'
        $this->advance();

        $tagNameStart = $this->position;
        while ($this->position < $this->length
            && !ctype_space($this->current())
            && $this->current() !== '>'
        ) {
            $this->advance();
        }

        $tagName = strtolower(substr($this->html, $tagNameStart, $this->position - $tagNameStart));

        // Skip anything between the name and the terminating '>'.
        $this->consumeThrough('>');

        if ($this->currentTagName === $tagName) {
            $this->currentTagName = null;
        }

        return new Token(substr($this->html, $start, $this->position - $start), TokenType::CLOSING_TAG);
    }

    /**
     * Consume a comment up to and including the comment terminator (or to the
     * end of the input when the terminator is missing).
     */
    private function consumeComment(int $start): Token
    {
        $this->consumeThrough(self::COMMENT_END);

        return new Token(substr($this->html, $start, $this->position - $start), TokenType::COMMENT);
    }

    /**
     * Consume a doctype declaration up to and including the next '>'.
     */
    private function consumeDoctype(int $start): Token
    {
        $this->consumeThrough('>');

        return new Token(substr($this->html, $start, $this->position - $start), TokenType::DOCTYPE);
    }

    /**
     * Consume text content and return it.
     *
     * Outside raw-text mode the cursor stops at the next occurrence of $char.
     * Inside a raw-text element it stops at the matching closing tag instead,
     * matched case-insensitively. In both cases the cursor stops at the end
     * of the input when nothing is found.
     *
     * NOTE(review): the closing-tag prefix ('</' followed by the tag name) was
     * damaged in the visible source and has been reconstructed — confirm.
     */
    private function consumeUntil(string $char): string
    {
        $start = $this->position;

        if ($this->currentTagName !== null) {
            $end = stripos($this->html, '</' . $this->currentTagName, $start);
        } elseif ($char === '') {
            // An empty needle can never be "found"; run to the end of input.
            $end = false;
        } else {
            $end = strpos($this->html, $char, $start);
        }

        $this->position = $end === false ? $this->length : $end;

        return substr($this->html, $start, $this->position - $start);
    }

    /**
     * Move the cursor just past the next occurrence of $terminator, or to the
     * end of the input when it does not occur.
     */
    private function consumeThrough(string $terminator): void
    {
        $end = strpos($this->html, $terminator, $this->position);
        $this->position = $end === false ? $this->length : $end + strlen($terminator);
    }

    /**
     * Whether the cursor sits on the closing tag of the raw-text element that
     * is currently open (prefix match, case-insensitive).
     */
    private function atRawTextClosingTag(): bool
    {
        if ($this->currentTagName === null) {
            return false;
        }

        $closingTag = '</' . $this->currentTagName;

        return strcasecmp($this->peek(strlen($closingTag)), $closingTag) === 0;
    }

    /**
     * Check if tag starts with "<x-".
     *
     * NOTE(review): the body of this method was lost in the visible source;
     * only the leading comment survived. The case-sensitive prefix test is a
     * reconstruction — confirm against the original file.
     */
    private function isXComponent(string $tagContent): bool
    {
        return strncmp($tagContent, '<x-', 3) === 0;
    }

    /**
     * Byte under the cursor, or '' when the cursor is at or past the end.
     *
     * NOTE(review): the signature line was lost in the visible source; the
     * name and string return are taken from the call sites and the surviving
     * body.
     */
    private function current(): string
    {
        if ($this->position >= $this->length) {
            return '';
        }

        return $this->html[$this->position];
    }

    /**
     * The next $length bytes starting at the cursor, or '' when fewer than
     * $length bytes remain.
     */
    private function peek(int $length): string
    {
        if ($this->position + $length > $this->length) {
            return '';
        }

        return substr($this->html, $this->position, $length);
    }

    /** Move the cursor forward by one byte. */
    private function advance(): void
    {
        $this->position++;
    }
}