<?php
declare(strict_types=1);
namespace voku\helper;
class HtmlDomParser extends AbstractDomParser
{
/**
 * Optional callback that may rewrite the XPath query before it is executed
 * (registered via setCallbackXPathBeforeQuery(), used by findInDocumentContext()).
 *
 * @var callable|null
 */
private $callbackXPathBeforeQuery;
/**
 * Optional callback that receives the raw HTML (and this parser) and returns
 * the HTML that createDOMDocument() should actually parse.
 *
 * @var callable|null
 */
private $callbackBeforeCreateDom;
/**
 * Lower-cased legacy method names mapped to the real method that __call() forwards to.
 *
 * @var string[]
 */
protected static $functionAliases = [
'outertext' => 'html',
'outerhtml' => 'html',
'innertext' => 'innerHtml',
'innerhtml' => 'innerHtml',
'load' => 'loadHtml',
'load_file' => 'loadHtmlFile',
];
/**
 * Substrings that mark the content of a special script tag as template logic;
 * keepSpecialScriptTags() swaps such content for a placeholder.
 *
 * @var string[]
 */
protected $templateLogicSyntaxInSpecialScriptTags = [
'+',
'<%',
'{%',
'{{',
];
/**
 * Script "type" values that keepSpecialScriptTags() treats as special (template) scripts.
 *
 * @var string[]
 */
protected $specialScriptTags = [
'text/html',
'text/template',
'text/x-custom-template',
'text/x-handlebars-template',
];
/**
 * Tag names that createDOMDocument() rewrites from "<tag>" to "<tag/>" before parsing
 * and whose closing tags fixHtmlOutput() strips from the serialized output.
 *
 * @var string[]
 */
protected $selfClosingTags = [
'area',
'base',
'br',
'col',
'command',
'embed',
'hr',
'img',
'input',
'keygen',
'link',
'meta',
'param',
'source',
'track',
'wbr',
];
// The flags below are (re)computed by createDOMDocument() from the raw input string.
/** @var bool true when the input contained no "<" at all */
protected $isDOMDocumentCreatedWithoutHtml = false;
/** @var bool true when the input contains "<" but does not start with it (after ltrim) */
protected $isDOMDocumentCreatedWithoutWrapper = false;
/** @var bool true when the input starts (after ltrim) with an HTML comment */
protected $isDOMDocumentCreatedWithCommentWrapper = false;
/** @var bool true when the input had neither "<head " nor "<head>" */
protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
/** @var bool true when the input had neither "<p " nor "<p>" (case-insensitive) */
protected $isDOMDocumentCreatedWithoutPTagWrapper = false;
/** @var bool true when the input had neither "<html " nor "<html>" */
protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
/** @var bool true when the input had neither "<body " nor "<body>" */
protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
/** @var bool result of hasMultipleTopLevelNodes() for inputs without html/body tags */
protected $isDOMDocumentCreatedWithMultiRoot = false;
/** @var bool true for a fragment with leading/trailing whitespace around a leading element (see createDOMDocument()) */
protected $isDOMDocumentCreatedWithEdgeWhitespace = false;
/** @var bool true when the input had no "</script>" but did contain the escaped form "<\/script>" */
protected $isDOMDocumentCreatedWithFakeEndScript = false;
/** @var bool set by the constructor when the parser was built from a DOMNode instead of a string */
protected $createdFromNode = false;
/** @var bool toggled by useKeepBrokenHtml(); makes createDOMDocument() run keepBrokenHtml() on the input */
protected $keepBrokenHtml = false;
/**
 * Create a parser, optionally seeded from an existing element, DOM node, DOM document or HTML string.
 *
 * - SimpleHtmlDomInterface: unwrapped to its node first.
 * - \DOMDocument: serialized with saveHTML() and re-parsed through loadHtml().
 * - \DOMNode: deep-imported into a fresh document (marks the parser as "created from node").
 * - anything else that is not null: handed to loadHtml() as HTML.
 *
 * @param \DOMNode|SimpleHtmlDomInterface|string|null $element
 */
public function __construct($element = null)
{
    $this->document = new \DOMDocument('1.0', $this->getEncoding());
    $this->document->preserveWhiteSpace = true;
    $this->document->formatOutput = false;

    $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

    if ($source instanceof \DOMDocument) {
        $markup = $source->saveHTML();
        if ($markup !== false) {
            $this->loadHtml($markup);
        }
    } elseif ($source instanceof \DOMNode) {
        $this->createdFromNode = true;
        $imported = $this->document->importNode($source, true);
        if ($imported instanceof \DOMNode) {
            $this->document->appendChild($imported);
        }
    } elseif ($source !== null) {
        $this->loadHtml($source);
    }
}
/**
 * Forward legacy method names (see $functionAliases) to their current implementation.
 *
 * The lookup is case-insensitive because the name is lower-cased first.
 *
 * @param string $name
 * @param array  $arguments
 *
 * @throws \BadMethodCallException when the name is not a known alias
 *
 * @return mixed whatever the aliased method returns
 */
public function __call($name, $arguments)
{
    $alias = \strtolower($name);

    if (!isset(self::$functionAliases[$alias])) {
        throw new \BadMethodCallException('Method does not exist: ' . $alias);
    }

    $target = self::$functionAliases[$alias];

    return $this->{$target}(...$arguments);
}
/**
 * Static entry points kept for "simple_html_dom" compatibility:
 * str_get_html($html, $libXMLExtraOptions) and file_get_html($path, $libXMLExtraOptions).
 *
 * The name is matched case-insensitively: PHP method names are case-insensitive
 * and the instance-side __call() already lower-cases the name, so
 * HtmlDomParser::Str_Get_Html() behaves like HtmlDomParser::str_get_html().
 *
 * @param string $name
 * @param array  $arguments [0] HTML string or file path (default ''), [1] extra libxml options (default null)
 *
 * @throws \BadMethodCallException for any other method name
 *
 * @return DomParserInterface
 */
public static function __callStatic($name, $arguments)
{
    $htmlOrPath = $arguments[0] ?? '';
    $libXMLExtraOptions = $arguments[1] ?? null;

    $normalizedName = \strtolower((string) $name);

    if ($normalizedName === 'str_get_html') {
        return self::createStaticParser()->loadHtml($htmlOrPath, $libXMLExtraOptions);
    }

    if ($normalizedName === 'file_get_html') {
        return self::createStaticParser()->loadHtmlFile($htmlOrPath, $libXMLExtraOptions);
    }

    throw new \BadMethodCallException('Method does not exist');
}
/**
 * Instantiate the class the static call was made on (late static binding).
 *
 * @return static
 */
private static function createStaticParser()
{
    $parser = new static();

    return $parser;
}
/**
 * Expose the serialized document through legacy magic properties (case-insensitive).
 *
 * outerhtml/outertext -> html(), innerhtml/innertext -> innerHtml(),
 * innerhtmlkeep -> innerHtml(false, false), text/plaintext -> text().
 *
 * @param string $name
 *
 * @return string|null null for unknown property names
 */
public function __get($name)
{
    $property = \strtolower($name);

    if ($property === 'outerhtml' || $property === 'outertext') {
        return $this->html();
    }

    if ($property === 'innerhtml' || $property === 'innertext') {
        return $this->innerHtml();
    }

    if ($property === 'innerhtmlkeep') {
        return $this->innerHtml(false, false);
    }

    if ($property === 'text' || $property === 'plaintext') {
        return $this->text();
    }

    return null;
}
/**
 * Casting the parser to string yields the same markup as html().
 *
 * @return string
 */
public function __toString()
{
    $markup = $this->html();

    return $markup;
}
/**
 * No-op that always reports success; nothing is released here.
 *
 * @return bool always true
 */
public function clear(): bool
{
    $cleared = true;

    return $cleared;
}
/**
 * Parse an HTML string into $this->document and record, in the isDOMDocumentCreated* flags,
 * which wrappers the input was missing so that fixHtmlOutput() can strip them again later.
 *
 * The input is first tried as strict XML (simplexml); if that fails or reports libxml errors,
 * the method falls back to DOMDocument::loadHTML().
 *
 * @param string   $html
 * @param int|null $libXMLExtraOptions      extra libxml option bits OR-ed onto the defaults
 * @param bool     $useDefaultLibXMLOptions when false, only $libXMLExtraOptions are used
 *
 * @return \DOMDocument the document now held in $this->document
 */
protected function createDOMDocument(string $html, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): \DOMDocument
{
// Helper is defined outside this class; presumably clears placeholders registered by a previous parse — TODO confirm.
$this->resetDynamicDomHelpers();
// Give the user callback a chance to rewrite the raw HTML before any analysis happens.
if ($this->callbackBeforeCreateDom) {
$html = \call_user_func($this->callbackBeforeCreateDom, $html, $this);
}
$isDOMDocumentCreatedWithDoctype = false;
if (\stripos($html, '<!DOCTYPE') !== false) {
$isDOMDocumentCreatedWithDoctype = true;
// Drop non-whitespace content that precedes the doctype.
// NOTE(review): str_replace() removes every occurrence of that text, not only the leading one.
if (
\preg_match('/(^.*?)<!DOCTYPE(?: [^>]*)?>/sui', $html, $matches_before_doctype)
&&
\trim($matches_before_doctype[1])
) {
$html = \str_replace($matches_before_doctype[1], '', $html);
}
}
if ($this->keepBrokenHtml) {
$html = $this->keepBrokenHtml(\trim($html));
}
// Plain text (no "<" at all) vs. markup that starts with text before the first tag.
if (\strpos($html, '<') === false) {
$this->isDOMDocumentCreatedWithoutHtml = true;
} elseif (\strpos(\ltrim($html), '<') !== 0) {
$this->isDOMDocumentCreatedWithoutWrapper = true;
}
if (\strpos(\ltrim($html), '<!--') === 0) {
$this->isDOMDocumentCreatedWithCommentWrapper = true;
}
// The wrapper checks below are plain, case-sensitive substring tests on the raw input.
if (
\strpos($html, '<html ') === false
&&
\strpos($html, '<html>') === false
) {
$this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
}
if (
\strpos($html, '<body ') === false
&&
\strpos($html, '<body>') === false
) {
$this->isDOMDocumentCreatedWithoutBodyWrapper = true;
}
// Fragment (no html/body) with leading or trailing whitespace, at least two closing tags,
// and an attribute-less element with a matching closing tag at its start.
if (
$this->isDOMDocumentCreatedWithoutHtmlWrapper
&&
$this->isDOMDocumentCreatedWithoutBodyWrapper
&&
\trim($html) !== $html
&&
\substr_count($html, '</') >= 2
&&
\preg_match('#^\s*<([a-zA-Z][^\\s>/]*)>.*?</\\1>#su', $html) === 1
) {
$this->isDOMDocumentCreatedWithEdgeWhitespace = true;
}
if (
\strpos($html, '<head ') === false
&&
\strpos($html, '<head>') === false
) {
$this->isDOMDocumentCreatedWithoutHeadWrapper = true;
}
if (
\stripos($html, '<p ') === false
&&
\stripos($html, '<p>') === false
) {
$this->isDOMDocumentCreatedWithoutPTagWrapper = true;
}
// Only the JS-escaped closing form "<\/script>" is present, no real "</script>".
if (
\strpos($html, '</script>') === false
&&
\strpos($html, '<\/script>') !== false
) {
$this->isDOMDocumentCreatedWithFakeEndScript = true;
}
// Move content that follows "</html>" in front of the closing tag.
// (With the "U" modifier "(.*?)" is greedy, so it captures the rest of the input.)
if (\stripos($html, '</html>') !== false) {
if (
\preg_match('/<\/html>(.*?)/suiU', $html, $matches_after_html)
&&
\trim($matches_after_html[1])
) {
$html = \str_replace($matches_after_html[0], $matches_after_html[1] . '</html>', $html);
}
}
if (\strpos($html, '<script') !== false) {
// Protect template-style script tags only if one of the special type strings occurs anywhere.
foreach ($this->specialScriptTags as $tag) {
if (\strpos($html, $tag) !== false) {
$this->keepSpecialScriptTags($html);
break;
}
}
// Defined outside this class; modifies $html by reference — exact behaviour not visible here.
$this->html5FallbackForScriptTags($html);
}
if (\strpos($html, '<svg') !== false) {
$this->keepSpecialSvgTags($html);
}
// Rewrite attribute-less void tags ("<br>") to their self-closed form ("<br/>").
$html = \str_replace(
\array_map(static function ($e) {
return '<' . $e . '>';
}, $this->selfClosingTags),
\array_map(static function ($e) {
return '<' . $e . '/>';
}, $this->selfClosingTags),
$html
);
// Collect libxml errors internally; the previous settings are restored at the end of the method.
$internalErrors = \libxml_use_internal_errors(true);
if (\PHP_VERSION_ID < 80000) {
$disableEntityLoader = \libxml_disable_entity_loader(true);
}
\libxml_clear_errors();
$optionsXml = 0;
if ($useDefaultLibXMLOptions) {
$optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;
if (\defined('LIBXML_BIGLINES')) {
$optionsXml |= \LIBXML_BIGLINES;
}
if (\defined('LIBXML_COMPACT')) {
$optionsXml |= \LIBXML_COMPACT;
}
if (\defined('LIBXML_HTML_NODEFDTD')) {
$optionsXml |= \LIBXML_HTML_NODEFDTD;
}
}
if ($libXMLExtraOptions !== null) {
$optionsXml |= $libXMLExtraOptions;
}
// Multi-root detection only makes sense for fragments without html/body tags.
if (
$this->isDOMDocumentCreatedWithoutHtmlWrapper
&&
$this->isDOMDocumentCreatedWithoutBodyWrapper
) {
$this->isDOMDocumentCreatedWithMultiRoot = $this->hasMultipleTopLevelNodes($html, $optionsXml);
}
// Wrap the input in the internal helper element so that it has exactly one root.
if (
$this->isDOMDocumentCreatedWithMultiRoot
||
$this->isDOMDocumentCreatedWithEdgeWhitespace
||
$this->isDOMDocumentCreatedWithoutWrapper
||
$this->isDOMDocumentCreatedWithCommentWrapper
||
(
!$isDOMDocumentCreatedWithDoctype
&&
$this->keepBrokenHtml
)
) {
$html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
}
$html = self::replaceToPreserveHtmlEntities($html);
// First attempt: strict XML parsing; accepted only if libxml reported no errors at all.
$documentFound = false;
$sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
$domElementTmp = \dom_import_simplexml($sxe);
if ($domElementTmp->ownerDocument instanceof \DOMDocument) {
$documentFound = true;
$this->document = $domElementTmp->ownerDocument;
}
}
// Fallback: lenient HTML parsing with an "<?xml encoding=...?>" prefix as encoding hint.
if ($documentFound === false) {
$xmlHackUsed = false;
// NOTE(review): the stripos() arguments are (haystack '<?xml', needle $html), so this is true for
// practically every input and the prefix is added even when $html already starts with "<?xml".
// Possibly stripos($html, '<?xml') was intended — confirm before changing, output may depend on it.
if (\stripos('<?xml', $html) !== 0) {
$xmlHackUsed = true;
$html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
}
if ($html !== '') {
$this->document->loadHTML($html, $optionsXml);
}
// Remove the first top-level processing-instruction node (the prefix added above) from the parsed document.
if ($xmlHackUsed) {
foreach ($this->document->childNodes as $child) {
if ($child->nodeType === \XML_PI_NODE) {
$this->document->removeChild($child);
break;
}
}
}
}
$this->markSyntheticParagraphWrapper();
$this->document->encoding = $this->getEncoding();
// Restore the libxml state that was active before this method ran.
\libxml_clear_errors();
\libxml_use_internal_errors($internalErrors);
if (\PHP_VERSION_ID < 80000 && isset($disableEntityLoader)) {
\libxml_disable_entity_loader($disableEntityLoader);
}
return $this->document;
}
/**
 * Find elements by CSS selector in the whole document.
 *
 * @param string   $selector
 * @param int|null $idx null for all matches, otherwise the (possibly negative) index of a single match
 *
 * @return mixed the result of findInNodeContext() without a context node
 */
public function find(string $selector, $idx = null)
{
    $contextNode = null;

    return $this->findInNodeContext($selector, $contextNode, $idx);
}
/**
 * Find elements by CSS selector, optionally relative to a context node of this document.
 *
 * Delegates to findInDocumentContext() with this parser's document and XPath callback.
 *
 * @param string        $selector
 * @param \DOMNode|null $contextNode
 * @param int|null      $idx
 *
 * @return mixed
 */
public function findInNodeContext(string $selector, ?\DOMNode $contextNode = null, $idx = null)
{
    $document = $this->document;
    $beforeQuery = $this->callbackXPathBeforeQuery;

    return self::findInDocumentContext($selector, $document, $contextNode, $idx, $beforeQuery, $this);
}
/**
 * Convert a CSS selector to XPath, run it against $document and wrap the result.
 *
 * The optional callback is only invoked when a parser instance is supplied as well; it receives
 * (selector, xpath query, DOMXPath, parser) and its return value replaces the query.
 * With a context node, the query is rewritten so that every union branch is relative to that node.
 *
 * @param string             $selector
 * @param \DOMDocument       $document
 * @param \DOMNode|null      $contextNode
 * @param int|null           $idx
 * @param callable|null      $callbackXPathBeforeQuery
 * @param HtmlDomParser|null $queryHtmlDomParser
 *
 * @return mixed see createFindResultFromNodeList()
 */
public static function findInDocumentContext(
    string $selector,
    \DOMDocument $document,
    ?\DOMNode $contextNode = null,
    $idx = null,
    ?callable $callbackXPathBeforeQuery = null,
    ?self $queryHtmlDomParser = null
) {
    $query = SelectorConverter::toXPath($selector);
    $xPath = new \DOMXPath($document);

    $shouldRunCallback = $callbackXPathBeforeQuery !== null && $queryHtmlDomParser !== null;
    if ($shouldRunCallback) {
        $query = \call_user_func($callbackXPathBeforeQuery, $selector, $query, $xPath, $queryHtmlDomParser);
    }

    if ($contextNode !== null) {
        $query = self::scopeXPathQueryToContextNode($query);
    }

    return self::createFindResultFromNodeList(
        $xPath->query($query, $contextNode),
        $idx,
        $queryHtmlDomParser
    );
}
/**
 * Make every top-level union branch of an XPath query relative to the context node.
 *
 * A "." is inserted in front of each branch that starts with "/" (so "//a | //b" becomes
 * ".//a | .//b"). Quoted strings are copied verbatim, and a "|" only starts a new branch
 * when it is outside of any [...] predicate and (...) group.
 *
 * @param string $xPathQuery
 *
 * @return string
 */
private static function scopeXPathQueryToContextNode(string $xPathQuery): string
{
    $scoped = '';
    $activeQuote = null;
    $squareDepth = 0;
    $roundDepth = 0;
    $expectBranchStart = true;

    $total = \strlen($xPathQuery);
    $position = 0;

    while ($position < $total) {
        $char = $xPathQuery[$position];
        ++$position;

        // Inside a string literal: copy everything until the matching quote.
        if ($activeQuote !== null) {
            if ($char === $activeQuote) {
                $activeQuote = null;
            }
            $scoped .= $char;

            continue;
        }

        if ($char === '"' || $char === "'") {
            $activeQuote = $char;
            $scoped .= $char;

            continue;
        }

        if ($expectBranchStart) {
            // Leading whitespace does not count as the start of the branch.
            if (\trim($char) === '') {
                $scoped .= $char;

                continue;
            }

            $expectBranchStart = false;
            if ($char === '/') {
                $scoped .= '.';
            }
        }

        switch ($char) {
            case '[':
                ++$squareDepth;

                break;
            case ']':
                if ($squareDepth > 0) {
                    --$squareDepth;
                }

                break;
            case '(':
                ++$roundDepth;

                break;
            case ')':
                if ($roundDepth > 0) {
                    --$roundDepth;
                }

                break;
            case '|':
                // Only a union operator at nesting depth zero starts a new branch.
                if ($squareDepth === 0 && $roundDepth === 0) {
                    $expectBranchStart = true;
                }

                break;
        }

        $scoped .= $char;
    }

    return $scoped;
}
/**
 * Wrap the nodes of an XPath result and return either the whole collection or one entry.
 *
 * - $idx === null: the collection, or a SimpleHtmlDomNodeBlank when it is empty.
 * - otherwise: the entry at $idx (negative values count from the end), or a SimpleHtmlDomBlank.
 *
 * @param \DOMNodeList|false|null $nodesList
 * @param int|null                $idx
 * @param HtmlDomParser|null      $queryHtmlDomParser passed through to every SimpleHtmlDom
 *
 * @return mixed
 */
private static function createFindResultFromNodeList($nodesList, $idx, ?self $queryHtmlDomParser = null)
{
    $collection = new SimpleHtmlDomNode();

    foreach ($nodesList ?: [] as $candidate) {
        if ($candidate instanceof \DOMNode) {
            $collection[] = new SimpleHtmlDom($candidate, $queryHtmlDomParser);
        }
    }

    if ($idx === null) {
        return \count($collection) === 0 ? new SimpleHtmlDomNodeBlank() : $collection;
    }

    $offset = $idx < 0 ? \count($collection) + $idx : $idx;

    return $collection[$offset] ?? new SimpleHtmlDomBlank();
}
/**
 * Find all elements matching the selector.
 *
 * @param string $selector
 *
 * @return SimpleHtmlDomNodeInterface
 */
public function findMulti(string $selector): SimpleHtmlDomNodeInterface
{
    return $this->find($selector, null);
}
/**
 * Find all elements matching the selector, or false when nothing matched.
 *
 * @param string $selector
 *
 * @return mixed the find() result, or false when find() returned a SimpleHtmlDomNodeBlank
 */
public function findMultiOrFalse(string $selector)
{
    $matches = $this->find($selector, null);

    return $matches instanceof SimpleHtmlDomNodeBlank ? false : $matches;
}
/**
 * Find all elements matching the selector, or null when nothing matched.
 *
 * @param string $selector
 *
 * @return mixed the find() result, or null when find() returned a SimpleHtmlDomNodeBlank
 */
public function findMultiOrNull(string $selector)
{
    $matches = $this->find($selector, null);

    return $matches instanceof SimpleHtmlDomNodeBlank ? null : $matches;
}
/**
 * Find the first element matching the selector.
 *
 * @param string $selector
 *
 * @return SimpleHtmlDomInterface
 */
public function findOne(string $selector): SimpleHtmlDomInterface
{
    return $this->find($selector, 0);
}
/**
 * Find the first element matching the selector, or false when nothing matched.
 *
 * @param string $selector
 *
 * @return mixed the find() result, or false when find() returned a SimpleHtmlDomBlank
 */
public function findOneOrFalse(string $selector)
{
    $match = $this->find($selector, 0);

    return $match instanceof SimpleHtmlDomBlank ? false : $match;
}
/**
 * Find the first element matching the selector, or null when nothing matched.
 *
 * @param string $selector
 *
 * @return mixed the find() result, or null when find() returned a SimpleHtmlDomBlank
 */
public function findOneOrNull(string $selector)
{
    $match = $this->find($selector, 0);

    return $match instanceof SimpleHtmlDomBlank ? null : $match;
}
/**
 * Undo the wrappers and helper markup that parsing added, so that the serialized
 * output resembles the original input again.
 *
 * Which tags are stripped depends on the isDOMDocumentCreated* flags recorded while parsing.
 * Afterwards closing tags of void elements and the internal helper tags are removed,
 * the result is trimmed, entity-decoded and the preserved placeholders are put back.
 *
 * @param string $content
 * @param bool   $multiDecodeNewHtmlEntity passed to decodeHtmlEntity()
 * @param bool   $putBrokenReplacedBack    passed to putReplacedBackToPreserveHtmlEntities()
 *
 * @return string
 */
public function fixHtmlOutput(
    string $content,
    bool $multiDecodeNewHtmlEntity = false,
    bool $putBrokenReplacedBack = true
): string {
    // [flag, tags to strip when the flag is set] - applied in this order.
    $conditionalRemovals = [
        [$this->getIsDOMDocumentCreatedWithoutHtmlWrapper(), ['<html>', '</html>']],
        [$this->getIsDOMDocumentCreatedWithoutHeadWrapper(), ['<head>', '</head>']],
        [$this->getIsDOMDocumentCreatedWithoutBodyWrapper(), ['<body>', '</body>']],
        [$this->getIsDOMDocumentCreatedWithFakeEndScript(), ['</script>']],
    ];
    foreach ($conditionalRemovals as [$isSynthetic, $tags]) {
        if ($isSynthetic) {
            $content = \str_replace($tags, '', $content);
        }
    }

    if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
        // Input started with text: drop a leading "<p>" and every "</p>".
        $content = (string) \preg_replace('/^<p>/', '', $content);
        $content = (string) \preg_replace('/<\/p>/', '', $content);
    }

    if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
        $defaultDoctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">';
        $content = \str_replace($defaultDoctype, '', $content);
    }

    // Void elements must not carry a closing tag in the output.
    $voidClosingTags = [];
    foreach ($this->selfClosingTags as $tagName) {
        $voidClosingTags[] = '</' . $tagName . '>';
    }
    $content = \str_replace($voidClosingTags, '', $content);

    $helperMarkup = [
        '<simpleHtmlDomHtml>',
        '</simpleHtmlDomHtml>',
        '<simpleHtmlDomP>',
        '</simpleHtmlDomP>',
        '<head><head>',
        '</head></head>',
    ];
    $helperReplacements = [
        '',
        '',
        '',
        '',
        '<head>',
        '</head>',
    ];
    $content = \trim(\str_replace($helperMarkup, $helperReplacements, $content));

    $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

    return self::putReplacedBackToPreserveHtmlEntities($decoded, $putBrokenReplacedBack);
}
/**
 * Find all elements carrying the given class (selector "." . $class).
 *
 * @param string $class
 *
 * @return SimpleHtmlDomNodeInterface
 */
public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
{
    $selector = '.' . $class;

    return $this->findMulti($selector);
}
/**
 * Find the first element with the given id (selector "#" . $id).
 *
 * @param string $id
 *
 * @return SimpleHtmlDomInterface
 */
public function getElementById(string $id): SimpleHtmlDomInterface
{
    $selector = '#' . $id;

    return $this->findOne($selector);
}
/**
 * Return the first element with the given tag name, or a blank element when there is none.
 *
 * @param string $name
 *
 * @return SimpleHtmlDomInterface
 */
public function getElementByTagName(string $name): SimpleHtmlDomInterface
{
    $firstMatch = $this->document->getElementsByTagName($name)->item(0);

    return $firstMatch === null
        ? new SimpleHtmlDomBlank()
        : new SimpleHtmlDom($firstMatch, $this);
}
/**
 * Find elements with the given id (selector "#" . $id).
 *
 * @param string   $id
 * @param int|null $idx null for all matches, otherwise the index of a single match
 *
 * @return mixed see find()
 */
public function getElementsById(string $id, $idx = null)
{
    $selector = '#' . $id;

    return $this->find($selector, $idx);
}
/**
 * Return the elements with the given tag name.
 *
 * - $idx === null: all matches, or a SimpleHtmlDomNodeBlank when there are none.
 * - otherwise: the match at $idx (negative values count from the end), or a
 *   SimpleHtmlDomBlank when that index does not exist.
 *
 * The indexed miss used to return the list placeholder (SimpleHtmlDomNodeBlank). It now
 * returns the element placeholder, like find(), getElementsById() and getElementByTagName(),
 * by going through the same result builder that find() uses.
 *
 * @param string   $name
 * @param int|null $idx
 *
 * @return mixed
 */
public function getElementsByTagName(string $name, $idx = null)
{
    return self::createFindResultFromNodeList(
        $this->document->getElementsByTagName($name),
        $idx,
        $this
    );
}
/**
 * Serialize the document to HTML and clean the result with fixHtmlOutput().
 *
 * If a static callback is registered, it is invoked first with [$this].
 * The serialization strategy depends on the PHP version and on how the document was created.
 *
 * @param bool $multiDecodeNewHtmlEntity
 * @param bool $putBrokenReplacedBack
 *
 * @return string empty string when serialization fails
 */
public function html(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string
{
    if (static::$callback !== null) {
        \call_user_func(static::$callback, [$this]);
    }

    if ($this->shouldUseWholeDocumentSerializationForHtmlOnPhpLt8()) {
        $markup = $this->document->saveHTML();
    } elseif ($this->usesInternalWrapperDocument()) {
        $markup = $this->serializeInternalWrapperContent();
    } elseif ($this->createdFromNode) {
        $markup = \PHP_VERSION_ID < 80000
            ? $this->serializeCreatedFromNodeForPhpLt8()
            : $this->serializeChildNodes($this->document);
    } elseif ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
        $markup = $this->document->saveHTML($this->document->documentElement);
    } else {
        $markup = $this->document->saveHTML();
    }

    if ($markup === false) {
        return '';
    }

    return $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity, $putBrokenReplacedBack);
}
/**
 * Rename a parser-generated <p> wrapper to <simpleHtmlDomP> so fixHtmlOutput() can strip it.
 *
 * Only applies when the input contained no <p> tag, the root element is <html>, and the
 * <body> holds exactly one significant child (whitespace-only text is ignored) that is a <p>.
 * The children of that <p> are moved into the replacement element.
 */
private function markSyntheticParagraphWrapper(): void
{
    if (!$this->isDOMDocumentCreatedWithoutPTagWrapper) {
        return;
    }

    $root = $this->document->documentElement;
    $rootIsHtml = $root instanceof \DOMElement && \strtolower($root->tagName) === 'html';
    if (!$rootIsHtml) {
        return;
    }

    $body = $this->document->getElementsByTagName('body')->item(0);
    if (!$body instanceof \DOMElement) {
        return;
    }

    $paragraph = null;
    foreach ($body->childNodes as $child) {
        $isBlankText = $child instanceof \DOMText && \trim($child->nodeValue ?? '') === '';
        if ($isBlankText) {
            continue;
        }

        // Anything but a single <p> element means the paragraph is not a synthetic wrapper.
        $isOnlyParagraph = $paragraph === null
            && $child instanceof \DOMElement
            && \strtolower($child->tagName) === 'p';
        if (!$isOnlyParagraph) {
            return;
        }

        $paragraph = $child;
    }

    if ($paragraph === null || $paragraph->parentNode === null) {
        return;
    }

    $marker = $this->document->createElement('simpleHtmlDomP');
    for ($moved = $paragraph->firstChild; $moved !== null; $moved = $paragraph->firstChild) {
        $marker->appendChild($moved);
    }

    $paragraph->parentNode->replaceChild($marker, $paragraph);
}
/**
 * Serialize a single node to HTML.
 *
 * On PHP < 8, <script> and <style> elements are serialized through their owner document
 * (a single trailing newline is removed); every other node is deep-imported into a
 * fresh document and serialized from there.
 *
 * @param \DOMNode $node
 *
 * @return string empty string when importing or serializing fails
 */
private function serializeNode(\DOMNode $node): string
{
    $viaOwnerDocument = \PHP_VERSION_ID < 80000
        && $node instanceof \DOMElement
        && \in_array(\strtolower($node->tagName), ['script', 'style'], true);

    if ($viaOwnerDocument) {
        $owner = $node->ownerDocument;
        $markup = $owner !== null ? $owner->saveHTML($node) : false;
        if ($markup !== false && \substr($markup, -1) === "\n") {
            $markup = \substr($markup, 0, -1);
        }

        return $markup === false ? '' : $markup;
    }

    $scratch = new \DOMDocument('1.0', $this->getEncoding());
    $scratch->preserveWhiteSpace = true;
    $scratch->formatOutput = false;

    $copy = $scratch->importNode($node, true);
    if (!$copy instanceof \DOMNode) {
        return '';
    }
    $scratch->appendChild($copy);

    $markup = $scratch->saveHTML($copy);

    return $markup === false ? '' : $markup;
}
/**
 * Serialize a node-created document on PHP < 8 by saving the whole document and
 * stripping the doctype plus the <html>/<body> wrappers that were not part of the node.
 *
 * Wrappers are kept when the document element itself is <html> (or <body> for the body wrapper).
 *
 * @return string
 */
private function serializeCreatedFromNodeForPhpLt8(): string
{
    $markup = $this->document->saveHTML();
    if ($markup === false) {
        return '';
    }

    $markup = \trim((string) \preg_replace('/<!DOCTYPE[^>]+>/i', '', $markup));

    $root = $this->document->documentElement;
    $rootTag = $root instanceof \DOMElement ? \strtolower($root->tagName) : '';

    if ($rootTag === 'html') {
        return $markup;
    }

    $markup = (string) \preg_replace('/^<html[^>]*>/i', '', $markup);
    $markup = (string) \preg_replace('/<\/html>$/i', '', $markup);
    $markup = \trim($markup);

    if ($rootTag === 'body') {
        return $markup;
    }

    $markup = (string) \preg_replace('/^<body[^>]*>/i', '', $markup);
    $markup = (string) \preg_replace('/<\/body>$/i', '', $markup);
    $markup = \str_replace('<body></body>', '', $markup);

    return \trim($markup);
}
/**
 * Concatenate the serialized HTML of every direct child of the given node.
 *
 * @param \DOMNode $parentNode
 *
 * @return string
 */
private function serializeChildNodes(\DOMNode $parentNode): string
{
    $fragments = [];
    foreach ($parentNode->childNodes as $child) {
        $fragments[] = $this->serializeNode($child);
    }

    return \implode('', $fragments);
}
/**
 * Whether the document element is the internal wrapper element (self::$domHtmlWrapperHelper).
 *
 * @return bool
 */
private function usesInternalWrapperDocument(): bool
{
    $root = $this->document->documentElement;
    if (!$root instanceof \DOMElement) {
        return false;
    }

    return $root->tagName === self::$domHtmlWrapperHelper;
}
/**
 * Whether the document is an <html> root whose <head> is missing or empty
 * while its <body> has at least one child node.
 *
 * @return bool
 */
private function isBodyOnlyHtmlFragmentDocument(): bool
{
    $root = $this->document->documentElement;
    $rootIsHtml = $root instanceof \DOMElement && \strtolower($root->tagName) === 'html';
    if (!$rootIsHtml) {
        return false;
    }

    $head = $root->getElementsByTagName('head')->item(0);
    $body = $root->getElementsByTagName('body')->item(0);

    if ($head instanceof \DOMElement && $head->childNodes->length > 0) {
        return false;
    }

    return $body instanceof \DOMElement && $body->childNodes->length > 0;
}
/**
 * Decide whether html() should serialize the whole document via saveHTML() (PHP < 8 only).
 *
 * True on PHP < 8 when the internal wrapper is the document element, or when the input had no
 * <html> tag and the document element is either not <html> or a body-only <html> fragment.
 *
 * @return bool
 */
private function shouldUseWholeDocumentSerializationForHtmlOnPhpLt8(): bool
{
    if (\PHP_VERSION_ID >= 80000) {
        return false;
    }

    if ($this->usesInternalWrapperDocument()) {
        return true;
    }

    $root = $this->document->documentElement;
    if (!$this->getIsDOMDocumentCreatedWithoutHtmlWrapper() || !$root instanceof \DOMElement) {
        return false;
    }

    if (\strtolower($root->tagName) !== 'html') {
        return true;
    }

    return $this->isBodyOnlyHtmlFragmentDocument();
}
/**
 * Decide whether innerHtml() should serialize the whole document via saveHTML():
 * only on PHP < 8, for wrapper documents or body-only <html> fragments.
 *
 * @return bool
 */
private function shouldUseWholeDocumentSerializationForInnerHtmlOnPhpLt8(): bool
{
    if (\PHP_VERSION_ID >= 80000) {
        return false;
    }

    if ($this->usesInternalWrapperDocument()) {
        return true;
    }

    return $this->isBodyOnlyHtmlFragmentDocument();
}
/**
 * Serialize the children of the document element and re-wrap them in the internal wrapper tag.
 *
 * @return string empty string when the document has no document element
 */
private function serializeInternalWrapperContent(): string
{
    $root = $this->document->documentElement;
    if ($root === null) {
        return '';
    }

    $openingTag = '<' . self::$domHtmlWrapperHelper . '>';
    $inner = $this->serializeChildNodes($root);
    $closingTag = '</' . self::$domHtmlWrapperHelper . '>';

    return $openingTag . $inner . $closingTag;
}
/**
 * Probe whether the HTML fragment has more than one significant top-level node.
 *
 * The fragment is wrapped in the internal wrapper element and parsed as strict XML;
 * any parse failure or libxml error yields false. The previous libxml error mode is
 * restored in all cases.
 *
 * @param string $html
 * @param int    $optionsXml libxml option bits for simplexml_load_string()
 *
 * @return bool
 */
private function hasMultipleTopLevelNodes(string $html, int $optionsXml): bool
{
    $previousErrorMode = \libxml_use_internal_errors(true);

    try {
        \libxml_clear_errors();

        $probe = '<' . self::$domHtmlWrapperHelper . '>'
            . self::replaceToPreserveHtmlEntities($html)
            . '</' . self::$domHtmlWrapperHelper . '>';

        $parsed = \simplexml_load_string($probe, \SimpleXMLElement::class, $optionsXml);
        $parsedCleanly = $parsed !== false && \count(\libxml_get_errors()) === 0;
        if (!$parsedCleanly) {
            return false;
        }

        $wrapperElement = \dom_import_simplexml($parsed);

        return $wrapperElement instanceof \DOMElement
            && $this->countSignificantChildNodes($wrapperElement) > 1;
    } finally {
        \libxml_clear_errors();
        \libxml_use_internal_errors($previousErrorMode);
    }
}
/**
 * Count the direct children of a node, ignoring whitespace-only text nodes.
 *
 * Counting stops as soon as a second significant child is found, so the result is 0, 1 or 2.
 *
 * @param \DOMNode $node
 *
 * @return int
 */
private function countSignificantChildNodes(\DOMNode $node): int
{
    $significant = 0;

    foreach ($node->childNodes as $child) {
        $isBlankText = $child->nodeType === \XML_TEXT_NODE && \trim($child->textContent) === '';
        if ($isBlankText) {
            continue;
        }

        // The caller only needs to know whether there is more than one.
        if (++$significant > 1) {
            break;
        }
    }

    return $significant;
}
/**
 * Serialize the content of the document element and clean it with fixHtmlOutput().
 *
 * @param bool $multiDecodeNewHtmlEntity
 * @param bool $putBrokenReplacedBack
 *
 * @return string
 */
public function innerHtml(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string
{
    $markup = '';

    $root = $this->document->documentElement;
    if ($root) {
        if ($this->shouldUseWholeDocumentSerializationForInnerHtmlOnPhpLt8()) {
            $markup = $this->document->saveHTML();
        } elseif ($this->usesInternalWrapperDocument()) {
            $markup = $this->serializeInternalWrapperContent();
        } else {
            $markup = $this->serializeChildNodes($this->document->documentElement);
        }
    }

    // saveHTML() may return false.
    $markup = $markup === false ? '' : $markup;

    return $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity, $putBrokenReplacedBack);
}
/**
 * Return the plain text of the document.
 *
 * Concatenates all text nodes that are not inside <script>, <style> or the internal
 * special-script helper element, then cleans the result with fixHtmlOutput().
 *
 * @param bool $multiDecodeNewHtmlEntity
 *
 * @return string
 */
public function text(bool $multiDecodeNewHtmlEntity = false): string
{
    $xPath = new \DOMXPath($this->document);
    $query = \sprintf(
        '//text()[not(ancestor::script or ancestor::style or ancestor::%s)]',
        self::$domHtmlSpecialScriptHelper
    );

    $collected = '';
    $textNodes = $xPath->query($query);
    if ($textNodes !== false) {
        foreach ($textNodes as $textNode) {
            $collected .= $textNode->nodeValue;
        }
    }

    return $this->fixHtmlOutput($collected, $multiDecodeNewHtmlEntity);
}
/**
 * Parse an HTML string and make it the current document.
 *
 * @param string   $html
 * @param int|null $libXMLExtraOptions
 * @param bool     $useDefaultLibXMLOptions
 *
 * @return DomParserInterface this parser, for chaining
 */
public function loadHtml(string $html, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): DomParserInterface
{
    $parsed = $this->createDOMDocument($html, $libXMLExtraOptions, $useDefaultLibXMLOptions);
    $this->document = $parsed;

    return $this;
}
/**
 * Load HTML from a local file or an http(s) URL and parse it.
 *
 * Local paths must exist and be regular files; http(s) URLs skip these checks.
 * The content is read with \voku\helper\UTF8::file_get_contents() when that class
 * is available, otherwise with \file_get_contents().
 *
 * @param string   $filePath
 * @param int|null $libXMLExtraOptions
 * @param bool     $useDefaultLibXMLOptions
 *
 * @throws \RuntimeException when the file is missing, is not a regular file or cannot be read;
 *                           a caught reader exception is attached as "previous"
 *
 * @return DomParserInterface this parser, for chaining
 */
public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): DomParserInterface
{
    $isRemote = (bool) \preg_match("/^https?:\/\//i", $filePath);

    if (!$isRemote) {
        if (!\file_exists($filePath)) {
            throw new \RuntimeException('File ' . $filePath . ' not found');
        }

        if (!\is_file($filePath)) {
            throw new \RuntimeException('Could not load file ' . $filePath);
        }
    }

    try {
        if (\class_exists('\voku\helper\UTF8')) {
            $html = \voku\helper\UTF8::file_get_contents($filePath);
        } else {
            $html = \file_get_contents($filePath);
        }
    } catch (\Exception $e) {
        // Keep the original failure reachable through getPrevious() instead of discarding it.
        throw new \RuntimeException('Could not load file ' . $filePath, 0, $e);
    }

    if ($html === false) {
        throw new \RuntimeException('Could not load file ' . $filePath);
    }

    return $this->loadHtml($html, $libXMLExtraOptions, $useDefaultLibXMLOptions);
}
/**
 * Serialize the document as XML.
 *
 * @param bool $multiDecodeNewHtmlEntity
 * @param bool $htmlToXml       when true the result goes through fixHtmlOutput(); otherwise it is
 *                              only entity-decoded and the preserved placeholders are put back
 * @param bool $removeXmlHeader strip the "<?xml ... ?>" declaration and left-trim the result
 * @param int  $options         libxml options for saveXML()
 *
 * @return string empty string when saveXML() fails
 */
public function xml(
    bool $multiDecodeNewHtmlEntity = false,
    bool $htmlToXml = true,
    bool $removeXmlHeader = true,
    int $options = \LIBXML_NOEMPTYTAG
): string {
    $serialized = $this->document->saveXML(null, $options);
    if ($serialized === false) {
        return '';
    }

    if ($removeXmlHeader) {
        $withoutHeader = (string) \preg_replace('/<\?xml.*\?>/', '', $serialized);
        $serialized = \ltrim($withoutHeader);
    }

    if (!$htmlToXml) {
        $decoded = $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }

    return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
}
/**
 * Calling the parser like a function is a shortcut for find().
 *
 * @param string   $selector
 * @param int|null $idx
 *
 * @return mixed see find()
 */
public function __invoke($selector, $idx = null)
{
    $result = $this->find($selector, $idx);

    return $result;
}
/**
 * Whether the last parsed input had neither "<head " nor "<head>".
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
{
    return $this->isDOMDocumentCreatedWithoutHeadWrapper;
}
/**
 * Whether the last parsed input had neither "<p " nor "<p>" (case-insensitive).
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
{
    return $this->isDOMDocumentCreatedWithoutPTagWrapper;
}
/**
 * Whether the last parsed input contained no "<" at all (plain text).
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutHtml(): bool
{
    return $this->isDOMDocumentCreatedWithoutHtml;
}
/**
 * Whether the last parsed input had neither "<body " nor "<body>".
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
{
    return $this->isDOMDocumentCreatedWithoutBodyWrapper;
}
/**
 * Whether the last parsed fragment was detected to have several top-level nodes.
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithMultiRoot(): bool
{
    return $this->isDOMDocumentCreatedWithMultiRoot;
}
/**
 * Whether the last parsed input had neither "<html " nor "<html>".
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
{
    return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
}
/**
 * Whether the last parsed input contained markup but started with text instead of a tag.
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutWrapper(): bool
{
    return $this->isDOMDocumentCreatedWithoutWrapper;
}
/**
 * Whether the last parsed input had no "</script>" but contained the escaped form "<\/script>".
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
{
    return $this->isDOMDocumentCreatedWithFakeEndScript;
}
/**
 * Protect unbalanced closing tags so that the DOM parser does not drop them.
 *
 * Works in three passes:
 *  1. repeatedly mask properly paired "<tag ...>...</tag>" markup with temporary text tokens,
 *  2. repeatedly replace runs of remaining closing tags with a hash placeholder that is
 *     registered via registerDynamicDomBrokenReplaceHelper(),
 *  3. turn the temporary tokens back into real "<", "</" and ">".
 *
 * @param string $html
 *
 * @return string the HTML with broken parts swapped for placeholders
 */
protected function keepBrokenHtml(string $html): string
{
// Pass 1: each iteration masks one matched open/close pair; loop until nothing changes.
do {
$original = $html;
$html = (string) \preg_replace_callback(
'/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
static function ($matches) {
return $matches['start'] .
'°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
$matches['value'] .
'°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
$matches['end'];
},
$html
);
} while ($original !== $html);
// Pass 2: the closing tags still written with a real "</" have no masked partner.
do {
$original = $html;
$html = (string) \preg_replace_callback(
'/(?<start>[^<]*)?(?<broken>(?:<\/\w+(?:\s+\w+=\"[^"]+\")*+[^<]+>)+)(?<end>.*)/u',
function ($matches) {
// Unmask tokens inside the broken part so that the registered original is real markup.
$matches['broken'] = \str_replace(
['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'],
['</', '<', '>'],
$matches['broken']
);
// The placeholder is the broken-HTML helper prefix plus the CRC32 of the broken markup.
$matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['broken']);
$this->registerDynamicDomBrokenReplaceHelper($matches['broken'], $matchesHash);
return $matches['start'] . $matchesHash . $matches['end'];
},
$html
);
} while ($original !== $html);
// Pass 3: restore the masked, well-formed tags.
return \str_replace(
['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'],
['</', '<', '>'],
$html
);
}
/**
 * Protect inline SVG markup inside "(data:image/svg...<svg>...</svg>)" constructs.
 *
 * Each embedded <svg> is replaced by a placeholder (helper prefix + CRC32 of the markup)
 * that is registered via registerDynamicDomBrokenReplaceHelper(). $html is left untouched
 * when the regular expression fails.
 *
 * @param string $html modified in place
 */
protected function keepSpecialSvgTags(string &$html)
{
    $pattern = '/\((["\'])?(?<start>data:image\/svg.*)<svg(?<attr>[^>]*?)>(?<content>.*)<\/svg>\1\)/isU';

    $protect = function ($match) {
        $svgMarkup = '<svg' . $match['attr'] . '>' . $match['content'] . '</svg>';
        $placeholder = self::$domHtmlBrokenHtmlHelper . \crc32($svgMarkup);
        $this->registerDynamicDomBrokenReplaceHelper($svgMarkup, $placeholder);

        // $match[1] is the optional quote character around the data URI.
        return '(' . $match[1] . $match['start'] . $placeholder . $match[1] . ')';
    };

    $replaced = \preg_replace_callback($pattern, $protect, $html);
    if ($replaced === null) {
        return;
    }

    $html = $replaced;
}
/**
 * Protect <script> tags whose type is one of $specialScriptTags (template scripts).
 *
 * - If the script content contains template logic syntax, the content is replaced by a
 *   placeholder registered via registerDynamicDomBrokenReplaceHelper().
 * - Otherwise the whole tag is renamed to the internal special-script helper element.
 * In both cases escaped closing tags ("<\/") are unescaped first.
 *
 * @param string $html modified in place
 */
protected function keepSpecialScriptTags(string &$html)
{
    $quotedTypes = [];
    foreach ($this->specialScriptTags as $scriptType) {
        $quotedTypes[] = \preg_quote($scriptType, '/');
    }
    $tags = \implode('|', $quotedTypes);

    $pattern = '/(?<start>(<script [^>]*type=["\']?(?:' . $tags . ')+[^>]*>))(?<innerContent>.*)(?<end><\/script>)/isU';

    $protect = function ($matches) {
        foreach ($this->templateLogicSyntaxInSpecialScriptTags as $logicSyntax) {
            if (\strpos($matches['innerContent'], $logicSyntax) === false) {
                continue;
            }

            // Template logic found: hide the content behind a placeholder.
            $inner = \str_replace('<\/', '</', $matches['innerContent']);
            $placeholder = self::$domHtmlBrokenHtmlHelper . \crc32($inner);
            $this->registerDynamicDomBrokenReplaceHelper($inner, $placeholder);

            return $matches['start'] . $placeholder . $matches['end'];
        }

        // No template logic: swap "<script" / "</script>" for the helper element name.
        $wholeTag = \str_replace('<\/', '</', $matches[0]);
        $renamed = '<' . self::$domHtmlSpecialScriptHelper . \substr($wholeTag, \strlen('<script'));

        return \substr($renamed, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
    };

    $html = (string) \preg_replace_callback($pattern, $protect, $html);
}
/**
 * Enable or disable the broken-HTML protection applied by createDOMDocument().
 *
 * @param bool $keepBrokenHtml
 *
 * @return DomParserInterface this parser, for chaining
 */
public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
{
    $this->keepBrokenHtml = $keepBrokenHtml;

    return $this;
}
/**
 * Replace the list of substrings that mark script content as template logic.
 *
 * @param string[] $templateLogicSyntaxInSpecialScriptTags
 *
 * @throws \InvalidArgumentException when an entry is not a string (the list is left unchanged)
 *
 * @return DomParserInterface this parser, for chaining
 */
public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
{
    foreach ($templateLogicSyntaxInSpecialScriptTags as $syntax) {
        if (\is_string($syntax)) {
            continue;
        }

        throw new \InvalidArgumentException('setTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
    }

    $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

    return $this;
}
/**
 * Replace the list of script "type" values that are treated as special (template) scripts.
 *
 * @param string[] $specialScriptTags
 *
 * @throws \InvalidArgumentException when an entry is not a string (the list is left unchanged)
 *
 * @return DomParserInterface this parser, for chaining
 */
public function overwriteSpecialScriptTags(array $specialScriptTags): DomParserInterface
{
    foreach ($specialScriptTags as $scriptType) {
        if (\is_string($scriptType)) {
            continue;
        }

        throw new \InvalidArgumentException('SpecialScriptTags only allows string[]');
    }

    $this->specialScriptTags = $specialScriptTags;

    return $this;
}
/**
 * Register a callback that may rewrite the XPath query before it is executed.
 *
 * It is called with (selector, xpath query, \DOMXPath, parser) and must return the query to run.
 *
 * @param callable $callbackXPathBeforeQuery
 *
 * @return self this parser, for chaining
 */
public function setCallbackXPathBeforeQuery(callable $callbackXPathBeforeQuery): self
{
    $this->callbackXPathBeforeQuery = $callbackXPathBeforeQuery;

    return $this;
}
/**
 * Register a callback that may rewrite the raw HTML before it is parsed.
 *
 * It is called with (html, parser) and must return the HTML to parse.
 *
 * @param callable $callbackBeforeCreateDom
 *
 * @return self this parser, for chaining
 */
public function setCallbackBeforeCreateDom(callable $callbackBeforeCreateDom): self
{
    $this->callbackBeforeCreateDom = $callbackBeforeCreateDom;

    return $this;
}
}