Crawler.php 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\DomCrawler;
  11. use Masterminds\HTML5;
  12. use Symfony\Component\CssSelector\CssSelectorConverter;
  13. /**
  14. * Crawler eases navigation of a list of \DOMNode objects.
  15. *
  16. * @author Fabien Potencier <fabien@symfony.com>
  17. */
  18. class Crawler implements \Countable, \IteratorAggregate
  19. {
  20. /**
  21. * @var string|null
  22. */
  23. protected $uri;
  24. /**
  25. * The default namespace prefix to be used with XPath and CSS expressions.
  26. *
  27. * @var string
  28. */
  29. private $defaultNamespacePrefix = 'default';
  30. /**
  31. * A map of manually registered namespaces.
  32. *
  33. * @var array<string, string>
  34. */
  35. private $namespaces = [];
  36. /**
  37. * The base href value.
  38. *
  39. * @var string|null
  40. */
  41. private $baseHref;
  42. /**
  43. * @var \DOMDocument|null
  44. */
  45. private $document;
  46. /**
  47. * @var \DOMNode[]
  48. */
  49. private $nodes = [];
  50. /**
  51. * Whether the Crawler contains HTML or XML content (used when converting CSS to XPath).
  52. *
  53. * @var bool
  54. */
  55. private $isHtml = true;
  56. /**
  57. * @var HTML5|null
  58. */
  59. private $html5Parser;
  /**
   * Builds a crawler and optionally seeds it with an initial node, node list or markup string.
   *
   * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node     A Node to use as the base for the crawling
   * @param string|null                                  $uri      The current URI
   * @param string|null                                  $baseHref The base href; when empty, $uri is used instead
   */
  public function __construct($node = null, ?string $uri = null, ?string $baseHref = null)
  {
      $this->uri = $uri;
      // An empty base href (null or '') falls back to the URI.
      $this->baseHref = $baseHref ?: $uri;
      // The HTML5 parser is only instantiated when the Masterminds library is available.
      $this->html5Parser = class_exists(HTML5::class) ? new HTML5(['disable_html_ns' => true]) : null;

      $this->add($node);
  }
  /**
   * Gets the URI currently associated with this crawler.
   *
   * @return string|null The URI, or null when none is set
   */
  public function getUri()
  {
      $currentUri = $this->uri;

      return $currentUri;
  }
  /**
   * Gets the base href currently associated with this crawler.
   *
   * @return string|null The base href, or null when none is set
   */
  public function getBaseHref()
  {
      $currentBaseHref = $this->baseHref;

      return $currentBaseHref;
  }
  /**
   * Empties the crawler: forgets the owning document and drops every node.
   */
  public function clear()
  {
      $this->document = null;
      $this->nodes = [];
  }
  /**
   * Adds a node (or several) to the current list of nodes.
   *
   * The argument is dispatched to the specialized add*() method that
   * matches its type; null is accepted and silently ignored.
   *
   * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node A node
   *
   * @throws \InvalidArgumentException when node is not the expected type
   */
  public function add($node)
  {
      if (null === $node) {
          return;
      }

      if ($node instanceof \DOMNodeList) {
          $this->addNodeList($node);

          return;
      }

      if ($node instanceof \DOMNode) {
          $this->addNode($node);

          return;
      }

      if (\is_array($node)) {
          $this->addNodes($node);

          return;
      }

      if (\is_string($node)) {
          $this->addContent($node);

          return;
      }

      $actualType = \is_object($node) ? \get_class($node) : \gettype($node);

      throw new \InvalidArgumentException(sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', $actualType));
  }
  /**
   * Adds HTML/XML content.
   *
   * If the charset is not set via the content type, it is assumed to be UTF-8,
   * or ISO-8859-1 as a fallback, which is the default charset defined by the
   * HTTP 1.1 specification.
   *
   * When no type is given, content starting with "<?xml" is treated as
   * application/xml and anything else as text/html. Content whose type is
   * neither XML nor HTML is ignored.
   *
   * @param string      $content A string to parse as HTML/XML
   * @param string|null $type    The content type of the string
   */
  public function addContent($content, $type = null)
  {
      if (empty($type)) {
          $type = str_starts_with($content, '<?xml') ? 'application/xml' : 'text/html';
      }

      // DOM only for HTML/XML content
      if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
          return;
      }

      // Content that is valid UTF-8 is assumed to be UTF-8; otherwise fall back to ISO-8859-1.
      $charset = preg_match('//u', $content) ? 'UTF-8' : 'ISO-8859-1';

      // http://www.w3.org/TR/encoding/#encodings
      // http://www.w3.org/TR/REC-xml/#NT-EncName
      // Only the first charset declaration is inspected; it is kept when usable
      // and otherwise rewritten to the detected charset.
      $content = preg_replace_callback('/(charset *= *["\']?)([a-zA-Z\-0-9_:.]+)/i', function ($m) use (&$charset) {
          if ('charset=' === $this->convertToHtmlEntities('charset=', $m[2])) {
              $charset = $m[2];
          }

          return $m[1].$charset;
      }, $content, 1);

      // The type was matched case-insensitively, so the captured group must be
      // compared case-insensitively too (e.g. "application/XML").
      if ('x' === strtolower($xmlMatches[1])) {
          $this->addXmlContent($content, $charset);
      } else {
          $this->addHtmlContent($content, $charset);
      }
  }
  /**
   * Parses an HTML string and adds the resulting document to the list of nodes.
   *
   * After the document is added, the href of the first <base> element found
   * (if any, and non-empty) updates the crawler's base href: it is resolved
   * against the existing base href when there is one, or used as-is otherwise.
   *
   * The libxml errors are disabled when the content is parsed.
   *
   * If you want to get parsing errors, be sure to enable
   * internal errors via libxml_use_internal_errors(true)
   * and then, get the errors via libxml_get_errors(). Be
   * sure to clear errors with libxml_clear_errors() afterward.
   *
   * @param string $content The HTML content
   * @param string $charset The charset
   */
  public function addHtmlContent($content, $charset = 'UTF-8')
  {
      $dom = $this->parseHtmlString($content, $charset);
      $this->addDocument($dom);

      $hrefs = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
      $firstHref = current($hrefs);

      // Nothing to do without a usable <base href="...">.
      if (!\count($hrefs) || empty($firstHref)) {
          return;
      }

      if (!$this->baseHref) {
          $this->baseHref = $firstHref;

          return;
      }

      // Resolve the <base> href against the current base href through a throwaway <a> element.
      $anchor = $dom->createElement('a');
      $anchor->setAttribute('href', $firstHref);
      $resolved = new Link($anchor, $this->baseHref);

      $this->baseHref = $resolved->getUri();
  }
  /**
   * Parses an XML string and adds the resulting document to the list of nodes.
   *
   * libxml errors are collected internally while parsing and the previous
   * error-handling mode is restored afterwards. The crawler is flagged as
   * holding XML (not HTML) content once the document has been added.
   *
   * If you want to get parsing errors, be sure to enable
   * internal errors via libxml_use_internal_errors(true)
   * and then, get the errors via libxml_get_errors(). Be
   * sure to clear errors with libxml_clear_errors() afterward.
   *
   * @param string $content The XML content
   * @param string $charset The charset
   * @param int    $options Bitwise OR of the libxml option constants
   *                        LIBXML_PARSEHUGE is dangerous, see
   *                        http://symfony.com/blog/security-release-symfony-2-0-17-released
   */
  public function addXmlContent($content, $charset = 'UTF-8', $options = \LIBXML_NONET)
  {
      // remove the default namespace if it's the only namespace to make XPath expressions simpler
      if (!preg_match('/xmlns:/', $content)) {
          $content = str_replace('xmlns', 'ns', $content);
      }

      // The entity loader is only toggled on libxml builds older than 2.9.0.
      $legacyLibxml = \LIBXML_VERSION < 20900;

      $previousErrorMode = libxml_use_internal_errors(true);
      if ($legacyLibxml) {
          $previousEntityLoader = libxml_disable_entity_loader(true);
      }

      $document = new \DOMDocument('1.0', $charset);
      $document->validateOnParse = true;

      // Blank content is not parsed; the document simply stays empty.
      if ('' !== trim($content)) {
          @$document->loadXML($content, $options);
      }

      // Restore the global libxml state exactly as it was found.
      libxml_use_internal_errors($previousErrorMode);
      if ($legacyLibxml) {
          libxml_disable_entity_loader($previousEntityLoader);
      }

      $this->addDocument($document);

      $this->isHtml = false;
  }
  /**
   * Adds the root element of a \DOMDocument to the list of nodes.
   *
   * A document without a root element is ignored.
   *
   * @param \DOMDocument $dom A \DOMDocument instance
   */
  public function addDocument(\DOMDocument $dom)
  {
      $root = $dom->documentElement;

      if (!$root) {
          return;
      }

      $this->addNode($root);
  }
  /**
   * Adds every \DOMNode contained in a \DOMNodeList to the list of nodes.
   *
   * Entries that are not \DOMNode instances are skipped.
   *
   * @param \DOMNodeList $nodes A \DOMNodeList instance
   */
  public function addNodeList(\DOMNodeList $nodes)
  {
      foreach ($nodes as $candidate) {
          if (!$candidate instanceof \DOMNode) {
              continue;
          }

          $this->addNode($candidate);
      }
  }
  /**
   * Adds each element of an array to the list of nodes.
   *
   * Every element goes through add(), so it is dispatched according to its own type.
   *
   * @param \DOMNode[] $nodes An array of \DOMNode instances
   */
  public function addNodes(array $nodes)
  {
      foreach ($nodes as $entry) {
          $this->add($entry);
      }
  }
  /**
   * Adds a \DOMNode instance to the list of nodes.
   *
   * A \DOMDocument is replaced by its root element; a document without a
   * root element is ignored, as addDocument() does. All nodes held by one
   * crawler must belong to the same document, and a node that is already
   * present is not added a second time.
   *
   * @param \DOMNode $node A \DOMNode instance
   *
   * @throws \InvalidArgumentException When the node belongs to a different document than the nodes already held
   */
  public function addNode(\DOMNode $node)
  {
      if ($node instanceof \DOMDocument) {
          $node = $node->documentElement;

          // An empty document has no root element: there is nothing to add.
          if (null === $node) {
              return;
          }
      }

      if (null !== $this->document && $this->document !== $node->ownerDocument) {
          throw new \InvalidArgumentException('Attaching DOM nodes from multiple documents in the same crawler is forbidden.');
      }

      // The first node added decides which document this crawler is bound to.
      if (null === $this->document) {
          $this->document = $node->ownerDocument;
      }

      // Don't add duplicate nodes in the Crawler
      if (\in_array($node, $this->nodes, true)) {
          return;
      }

      $this->nodes[] = $node;
  }
  /**
   * Returns the node found at the given position of the node list.
   *
   * When no node exists at that position, the returned crawler is built from null.
   *
   * @param int $position The position
   *
   * @return static
   */
  public function eq($position)
  {
      return $this->createSubCrawler($this->nodes[$position] ?? null);
  }
  /**
   * Calls an anonymous function on each node of the list.
   *
   * The anonymous function receives the node wrapped in a Crawler
   * instance as first argument and its position as second argument.
   *
   * Example:
   *
   *     $crawler->filter('h1')->each(function ($node, $i) {
   *         return $node->text();
   *     });
   *
   * @param \Closure $closure An anonymous function
   *
   * @return array An array of values returned by the anonymous function
   */
  public function each(\Closure $closure)
  {
      $results = [];

      foreach ($this->nodes as $index => $domNode) {
          $wrapped = $this->createSubCrawler($domNode);
          $results[] = $closure($wrapped, $index);
      }

      return $results;
  }
  /**
   * Returns the portion of the node list selected by $offset and $length.
   *
   * Both values are passed to array_slice() unchanged.
   *
   * @param int $offset
   * @param int $length
   *
   * @return static
   */
  public function slice($offset = 0, $length = null)
  {
      $subset = \array_slice($this->nodes, $offset, $length);

      return $this->createSubCrawler($subset);
  }
  /**
   * Reduces the list of nodes by calling an anonymous function.
   *
   * The function receives each node wrapped in a Crawler and its position.
   * A node is dropped only when the function returns exactly false; any
   * other return value keeps it.
   *
   * @param \Closure $closure An anonymous function
   *
   * @return static
   */
  public function reduce(\Closure $closure)
  {
      $kept = [];

      foreach ($this->nodes as $index => $domNode) {
          $verdict = $closure($this->createSubCrawler($domNode), $index);

          if (false === $verdict) {
              continue;
          }

          $kept[] = $domNode;
      }

      return $this->createSubCrawler($kept);
  }
  /**
   * Returns the node at position 0 of the current selection.
   *
   * @return static
   */
  public function first()
  {
      $firstPosition = 0;

      return $this->eq($firstPosition);
  }
  /**
   * Returns the node at the final position of the current selection.
   *
   * @return static
   */
  public function last()
  {
      $lastPosition = \count($this->nodes) - 1;

      return $this->eq($lastPosition);
  }
  /**
   * Returns the sibling nodes of the first node of the current selection.
   *
   * The lookup starts from the first child of that node's parent.
   *
   * @return static
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function siblings()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstOfParent = $this->getNode(0)->parentNode->firstChild;
      $siblingNodes = $this->sibling($firstOfParent);

      return $this->createSubCrawler($siblingNodes);
  }
  /**
   * Tells whether the current selection matches a CSS selector.
   *
   * The selector is converted to XPath with a "self::" prefix; the result is
   * true when filtering the current nodes with it yields at least one node.
   * An empty crawler never matches.
   */
  public function matches(string $selector): bool
  {
      if (!$this->nodes) {
          return false;
      }

      $selfXPath = $this->createCssSelectorConverter()->toXPath($selector, 'self::');
      $matchCount = $this->filterRelativeXPath($selfXPath)->count();

      return 0 !== $matchCount;
  }
  /**
   * Returns the closest element matching the provided selector, starting with
   * the first node of the selection itself and then heading toward the
   * document root through its parents.
   *
   * The walk stops at the first non-element node, or when there is no parent
   * left (e.g. a subtree that is not attached to a document).
   *
   * @see https://developer.mozilla.org/en-US/docs/Web/API/Element/closest#Polyfill
   *
   * @return self|null The matching element wrapped in a crawler, or null when nothing matches
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function closest(string $selector): ?self
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $domNode = $this->getNode(0);

      // parentNode is null once the top of a detached subtree is reached, so
      // guard against null before reading nodeType.
      while (null !== $domNode && \XML_ELEMENT_NODE === $domNode->nodeType) {
          $node = $this->createSubCrawler($domNode);
          if ($node->matches($selector)) {
              return $node;
          }

          $domNode = $node->getNode(0)->parentNode;
      }

      return null;
  }
  /**
   * Returns the sibling nodes that come after the first node of the current selection.
   *
   * @return static
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function nextAll()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $following = $this->sibling($this->getNode(0));

      return $this->createSubCrawler($following);
  }
  /**
   * Returns the sibling nodes that come before the first node of the current selection.
   *
   * @return static
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function previousAll()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $preceding = $this->sibling($this->getNode(0), 'previousSibling');

      return $this->createSubCrawler($preceding);
  }
  /**
   * Returns the ancestor elements of the first node of the current selection.
   *
   * Ancestors are collected from the nearest parent upwards; only element
   * nodes are kept.
   *
   * @return static
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function parents()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $ancestors = [];

      for ($current = $this->getNode(0)->parentNode; $current; $current = $current->parentNode) {
          if (\XML_ELEMENT_NODE !== $current->nodeType) {
              continue;
          }

          $ancestors[] = $current;
      }

      return $this->createSubCrawler($ancestors);
  }
  /**
   * Returns the children nodes of the first node of the current selection.
   *
   * With a selector, the children are selected by converting the selector to
   * XPath with a "child::" prefix. Without one, the children are gathered
   * starting from the node's first child.
   *
   * @param string|null $selector An optional CSS selector to filter children
   *
   * @return static
   *
   * @throws \InvalidArgumentException When current node is empty
   * @throws \RuntimeException         If the CssSelector Component is not available and $selector is provided
   */
  public function children(/* string $selector = null */)
  {
      $argCount = \func_num_args();

      // Deprecation notice aimed at subclasses overriding this method without
      // the upcoming $selector argument; common mocking frameworks are excluded.
      if (
          $argCount < 1
          && __CLASS__ !== static::class
          && __CLASS__ !== (new \ReflectionMethod($this, __FUNCTION__))->getDeclaringClass()->getName()
          && !$this instanceof \PHPUnit\Framework\MockObject\MockObject
          && !$this instanceof \Prophecy\Prophecy\ProphecySubjectInterface
          && !$this instanceof \Mockery\MockInterface
      ) {
          @trigger_error(sprintf('The "%s()" method will have a new "string $selector = null" argument in version 5.0, not defining it is deprecated since Symfony 4.2.', __METHOD__), \E_USER_DEPRECATED);
      }

      $selector = 0 < $argCount ? func_get_arg(0) : null;

      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      if (null === $selector) {
          $firstChild = $this->getNode(0)->firstChild;

          return $this->createSubCrawler($firstChild ? $this->sibling($firstChild) : []);
      }

      $childXPath = $this->createCssSelectorConverter()->toXPath($selector, 'child::');

      return $this->filterRelativeXPath($childXPath);
  }
  /**
   * Reads an attribute from the first node of the list.
   *
   * @param string $attribute The attribute name
   *
   * @return string|null The attribute value or null if the attribute does not exist
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function attr($attribute)
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $element = $this->getNode(0);

      if (!$element->hasAttribute($attribute)) {
          return null;
      }

      return $element->getAttribute($attribute);
  }
  /**
   * Gives the name of the first node of the list.
   *
   * @return string The node name
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function nodeName()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstNode = $this->getNode(0);

      return $firstNode->nodeName;
  }
  /**
   * Returns the text of the first node of the list.
   *
   * Pass true as the second argument to normalize whitespaces. When the
   * second argument is omitted the raw text is returned, and a deprecation
   * is triggered if normalizing would have changed it.
   *
   * @param string|null $default             When not null: the value to return when the current node is empty
   * @param bool        $normalizeWhitespace Whether whitespaces should be trimmed and normalized to single spaces
   *
   * @return string The node value
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function text(/* string $default = null, bool $normalizeWhitespace = true */)
  {
      $argCount = \func_num_args();

      if (!$this->nodes) {
          if (0 < $argCount && null !== func_get_arg(0)) {
              return (string) func_get_arg(0);
          }

          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $rawText = $this->getNode(0)->nodeValue;

      // No explicit $normalizeWhitespace: keep the raw text, but warn when
      // normalization would produce a different string.
      if ($argCount <= 1) {
          $normalized = trim(preg_replace('/(?:\s{2,}+|[^\S ])/', ' ', $rawText));

          if ($normalized !== $rawText) {
              @trigger_error(sprintf('"%s()" will normalize whitespaces by default in Symfony 5.0, set the second "$normalizeWhitespace" argument to false to retrieve the non-normalized version of the text.', __METHOD__), \E_USER_DEPRECATED);
          }

          return $rawText;
      }

      if (!func_get_arg(1)) {
          return $rawText;
      }

      return trim(preg_replace('/(?:\s{2,}+|[^\S ])/', ' ', $rawText));
  }
  /**
   * Returns the inner HTML of the first node of the list, i.e. the
   * concatenated markup of its child nodes.
   *
   * The HTML5 parser is used for serialization when it is available and the
   * owner document starts with an HTML5 doctype; otherwise the owner
   * document serializes the nodes.
   *
   * @param string|null $default When not null: the value to return when the current node is empty
   *
   * @return string The node html
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function html(/* string $default = null */)
  {
      if (!$this->nodes) {
          if (0 < \func_num_args() && null !== func_get_arg(0)) {
              return (string) func_get_arg(0);
          }

          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstNode = $this->getNode(0);
      $serializer = $firstNode->ownerDocument;

      if (null !== $this->html5Parser && '<!DOCTYPE html>' === $serializer->saveXML($serializer->childNodes[0])) {
          $serializer = $this->html5Parser;
      }

      $markup = '';
      foreach ($firstNode->childNodes as $childNode) {
          $markup .= $serializer->saveHTML($childNode);
      }

      return $markup;
  }
  /**
   * Returns the first node of the list as HTML, including the node itself.
   *
   * The HTML5 parser is used for serialization when it is available and the
   * owner document starts with an HTML5 doctype; otherwise the owner
   * document serializes the node.
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function outerHtml(): string
  {
      if (!\count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstNode = $this->getNode(0);
      $serializer = $firstNode->ownerDocument;

      if (null !== $this->html5Parser && '<!DOCTYPE html>' === $serializer->saveXML($serializer->childNodes[0])) {
          $serializer = $this->html5Parser;
      }

      $markup = $serializer->saveHTML($firstNode);

      return $markup;
  }
  /**
   * Evaluates an XPath expression against every node of the crawler.
   *
   * Since an XPath expression might evaluate to either a simple type or a \DOMNodeList,
   * this method will return either an array of simple types or a new Crawler instance.
   * The type of the first result decides which of the two is returned.
   *
   * @param string $xpath An XPath expression
   *
   * @return array|Crawler An array of evaluation results or a new Crawler instance
   *
   * @throws \LogicException When the crawler holds no document yet
   */
  public function evaluate($xpath)
  {
      if (null === $this->document) {
          throw new \LogicException('Cannot evaluate the expression on an uninitialized crawler.');
      }

      $prefixes = $this->findNamespacePrefixes($xpath);
      $engine = $this->createDOMXPath($this->document, $prefixes);

      $results = [];
      foreach ($this->nodes as $contextNode) {
          $results[] = $engine->evaluate($xpath, $contextNode);
      }

      $yieldsNodeLists = isset($results[0]) && $results[0] instanceof \DOMNodeList;

      return $yieldsNodeLists ? $this->createSubCrawler($results) : $results;
  }
  /**
   * Extracts information from the list of nodes.
   *
   * You can extract attributes or/and the node value (_text) or the node
   * name (_name). When exactly one attribute is requested, each entry of the
   * result is the bare value; otherwise each entry is an array of values in
   * the order requested.
   *
   * Example:
   *
   *     $crawler->filter('h1 a')->extract(['_text', 'href']);
   *
   * @param array $attributes An array of attributes
   *
   * @return array An array of extracted values
   */
  public function extract($attributes)
  {
      $attributes = (array) $attributes;
      $singleValue = 1 === \count($attributes);

      $rows = [];
      foreach ($this->nodes as $domNode) {
          $row = [];

          foreach ($attributes as $name) {
              if ('_text' === $name) {
                  $row[] = $domNode->nodeValue;

                  continue;
              }

              if ('_name' === $name) {
                  $row[] = $domNode->nodeName;

                  continue;
              }

              $row[] = $domNode->getAttribute($name);
          }

          $rows[] = $singleValue ? $row[0] : $row;
      }

      return $rows;
  }
  /**
   * Filters the list of nodes with an XPath expression.
   *
   * The XPath expression is evaluated in the context of the crawler, which
   * is considered as a fake parent of the elements inside it.
   * This means that a child selector "div" or "./div" will match only
   * the div elements of the current crawler, not their children.
   *
   * @param string $xpath An XPath expression
   *
   * @return static
   */
  public function filterXPath($xpath)
  {
      $relativeXPath = $this->relativize($xpath);

      // If we dropped all expressions in the XPath while preparing it, there would be no match
      return '' === $relativeXPath
          ? $this->createSubCrawler(null)
          : $this->filterRelativeXPath($relativeXPath);
  }
  /**
   * Filters the list of nodes with a CSS selector.
   *
   * The selector is converted to XPath, which requires the CssSelector
   * Symfony Component to be installed.
   *
   * @param string $selector A CSS selector
   *
   * @return static
   *
   * @throws \RuntimeException if the CssSelector Component is not available
   */
  public function filter($selector)
  {
      // The CssSelector already prefixes the selector with descendant-or-self::
      $xpath = $this->createCssSelectorConverter()->toXPath($selector);

      return $this->filterRelativeXPath($xpath);
  }
  /**
   * Selects links by name or alt value for clickable images.
   *
   * An <a> element is selected when its whitespace-normalized text contains
   * the value as a space-delimited token, or when it has an <img> child
   * whose alt attribute does.
   *
   * @param string $value The link text
   *
   * @return static
   */
  public function selectLink($value)
  {
      // Pad with spaces so the value only matches on whole, space-delimited tokens.
      $paddedLiteral = static::xpathLiteral(' '.$value.' ');

      $xpath = sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %1$s) or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %1$s)]]', $paddedLiteral);

      return $this->filterRelativeXPath($xpath);
  }
  /**
   * Selects images whose whitespace-normalized alt attribute contains the given value.
   *
   * @param string $value The image alt
   *
   * @return static A new instance of Crawler with the filtered list of nodes
   */
  public function selectImage($value)
  {
      $altLiteral = static::xpathLiteral($value);

      return $this->filterRelativeXPath(
          sprintf('descendant-or-self::img[contains(normalize-space(string(@alt)), %s)]', $altLiteral)
      );
  }
  /**
   * Selects a button by name or alt value for images.
   *
   * Matches <input> elements of type submit/button (by value) or image (by
   * alt), and <button> elements (by text); either kind also matches when its
   * id or name attribute equals the value. The type attribute is compared
   * case-insensitively.
   *
   * @param string $value The button text
   *
   * @return static
   */
  public function selectButton($value)
  {
      // XPath 1.0 has no lower-case(); translate() lowercases the type attribute.
      $lowercasedType = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';
      $paddedLiteral = static::xpathLiteral(' '.$value.' ');
      $exactLiteral = static::xpathLiteral($value);

      $xpath = sprintf('descendant-or-self::input[((contains(%1$s, "submit") or contains(%1$s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %2$s)) or (contains(%1$s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %2$s)) or @id=%3$s or @name=%3$s] | descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %2$s) or @id=%3$s or @name=%3$s]', $lowercasedType, $paddedLiteral, $exactLiteral);

      return $this->filterRelativeXPath($xpath);
  }
  /**
   * Builds a Link object from the first node in the list.
   *
   * @param string $method The method for the link (get by default)
   *
   * @return Link A Link instance
   *
   * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
   */
  public function link($method = 'get')
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $element = $this->getNode(0);

      if ($element instanceof \DOMElement) {
          return new Link($element, $this->baseHref, $method);
      }

      throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', \get_class($element)));
  }
  /**
   * Builds one Link object (using the "get" method) per node in the list.
   *
   * @return Link[] An array of Link instances
   *
   * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
   */
  public function links()
  {
      $collected = [];

      foreach ($this->nodes as $element) {
          if ($element instanceof \DOMElement) {
              $collected[] = new Link($element, $this->baseHref, 'get');

              continue;
          }

          throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', \get_class($element)));
      }

      return $collected;
  }
  /**
   * Builds an Image object from the first node in the list.
   *
   * @return Image An Image instance
   *
   * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
   */
  public function image()
  {
      if (!\count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $element = $this->getNode(0);

      if ($element instanceof \DOMElement) {
          return new Image($element, $this->baseHref);
      }

      throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', \get_class($element)));
  }
  /**
   * Builds one Image object per node in the list.
   *
   * @return Image[] An array of Image instances
   *
   * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
   */
  public function images()
  {
      $collected = [];

      foreach ($this as $element) {
          if ($element instanceof \DOMElement) {
              $collected[] = new Image($element, $this->baseHref);

              continue;
          }

          throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', \get_class($element)));
      }

      return $collected;
  }
  /**
   * Returns a Form object for the first node in the list.
   *
   * The Form is built from the first node, the crawler's URI, the given method and
   * the crawler's base href. When $values is not null it is passed to Form::setValues()
   * before the form is returned.
   *
   * @param array|null  $values An array of values for the form fields, or null to leave the fields untouched
   * @param string|null $method The method for the form, forwarded as-is to the Form constructor
   *
   * @return Form A Form instance
   *
   * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
   */
  public function form(?array $values = null, $method = null)
  {
      // The nullable type is spelled out explicitly: relying on "= null" to make the
      // parameter nullable is deprecated as of PHP 8.4.
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $node = $this->getNode(0);

      if (!$node instanceof \DOMElement) {
          throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', \get_class($node)));
      }

      $form = new Form($node, $this->uri, $method, $this->baseHref);

      if (null !== $values) {
          $form->setValues($values);
      }

      return $form;
  }
  /**
   * Replaces the default namespace prefix used with XPath and CSS expressions.
   *
   * The value is stored on the crawler as-is; no validation is performed here.
   *
   * @param string $prefix The prefix that stands for the default namespace
   */
  public function setDefaultNamespacePrefix($prefix)
  {
      $this->defaultNamespacePrefix = $prefix;
  }
  /**
   * Records a namespace under the given prefix in the crawler's namespace map.
   *
   * An existing entry for the same prefix is overwritten.
   *
   * @param string $prefix    The prefix used as key in the map
   * @param string $namespace The namespace stored for that prefix
   */
  public function registerNamespace($prefix, $namespace)
  {
      $this->namespaces[$prefix] = $namespace;
  }
  /**
   * Turns a string into an XPath string literal.
   *
   * A string without apostrophes is wrapped in apostrophes, a string with apostrophes
   * but without double quotes is wrapped in double quotes, and a string containing both
   * is expressed as a concat() call whose apostrophes are emitted as separate "'" literals.
   *
   * Examples:
   *
   *     echo Crawler::xpathLiteral('foo " bar');
   *     //prints 'foo " bar'
   *
   *     echo Crawler::xpathLiteral("foo ' bar");
   *     //prints "foo ' bar"
   *
   *     echo Crawler::xpathLiteral('a\'b"c');
   *     //prints concat('a', "'", 'b"c')
   *
   * @param string $s String to be escaped
   *
   * @return string Converted string
   */
  public static function xpathLiteral($s)
  {
      $hasApostrophe = str_contains($s, "'");
      if (!$hasApostrophe) {
          return sprintf("'%s'", $s);
      }

      $hasDoubleQuote = str_contains($s, '"');
      if (!$hasDoubleQuote) {
          return sprintf('"%s"', $s);
      }

      // Both quote kinds are present: quote every apostrophe-free segment with
      // apostrophes and glue the segments back together with a "'" literal.
      $quotedSegments = [];
      foreach (explode("'", $s) as $segment) {
          $quotedSegments[] = "'".$segment."'";
      }

      return 'concat('.implode(", \"'\", ", $quotedSegments).')';
  }
  /**
   * Runs an XPath expression against every node of the list and gathers the results.
   *
   * The expression is evaluated with each node as context node, so it should already
   * have been processed to apply in the context of each node. Results are added to a
   * fresh sub-crawler.
   *
   * @return static
   */
  private function filterRelativeXPath(string $xpath)
  {
      $namespacePrefixes = $this->findNamespacePrefixes($xpath);
      $result = $this->createSubCrawler(null);

      foreach ($this->nodes as $contextNode) {
          // A DOMXPath is created per node, bound to that node's owner document.
          $evaluator = $this->createDOMXPath($contextNode->ownerDocument, $namespacePrefixes);
          $matches = $evaluator->query($xpath, $contextNode);
          $result->add($matches);
      }

      return $result;
  }
  /**
   * Make the XPath relative to the current context.
   *
   * The returned XPath will match elements matching the XPath inside the current crawler
   * when running in the context of a node of the crawler.
   *
   * The expression is split on top-level "|" (outside quotes and square brackets); each
   * branch is rewritten on its own and the branches are joined again with " | ".
   * The input is returned untouched when a quoted string is not closed or when the scan
   * ends without reaching a top-level end of expression.
   */
  private function relativize(string $xpath): string
  {
      $whitespace = " \t\n\r\0\x0B";

      // Placeholder that can never match (an "a" element whose name is "b"). It is
      // substituted for branches that cannot match in the crawler; the branch is replaced
      // rather than removed, presumably so the union stays well-formed (TODO confirm).
      $neverMatches = 'a[name() = "b"]';

      $rewritten = [];
      $length = \strlen($xpath);
      $depth = 0;
      $branchStart = strspn($xpath, $whitespace);

      for ($cursor = $branchStart; $cursor <= $length; ++$cursor) {
          // Jump to the next character that matters: a quote, a bracket or a union bar.
          $cursor += strcspn($xpath, '"\'[]|', $cursor);

          if ($cursor < $length) {
              $char = $xpath[$cursor];

              if ('"' === $char || "'" === $char) {
                  // Skip the whole quoted string.
                  $closing = strpos($xpath, $char, $cursor + 1);
                  if (false === $closing) {
                      return $xpath; // The XPath expression is invalid
                  }
                  $cursor = $closing;

                  continue;
              }

              if ('[' === $char) {
                  ++$depth;

                  continue;
              }

              if (']' === $char) {
                  --$depth;

                  continue;
              }
          }

          // From here on the cursor sits on a "|" or at the end of the string.
          if (0 !== $depth) {
              // Still inside a predicate: this is not a top-level boundary.
              continue;
          }

          $opening = '';
          if ($branchStart < $length && '(' === $xpath[$branchStart]) {
              // If the union is inside some braces, we need to preserve the opening braces and apply
              // the change only inside it.
              $openingLength = 1 + strspn($xpath, '('.$whitespace, $branchStart + 1);
              $opening = substr($xpath, $branchStart, $openingLength);
              $branchStart += $openingLength;
          }

          $branch = rtrim(substr($xpath, $branchStart, $cursor - $branchStart));

          if (str_starts_with($branch, 'self::*/')) {
              $branch = './'.substr($branch, 8);
          }

          // add prefix before absolute element selector
          if ('' === $branch) {
              $branch = $neverMatches;
          } elseif (str_starts_with($branch, '//')) {
              $branch = 'descendant-or-self::'.substr($branch, 2);
          } elseif (str_starts_with($branch, './/')) {
              $branch = 'descendant-or-self::'.substr($branch, 3);
          } elseif (str_starts_with($branch, './')) {
              $branch = 'self::'.substr($branch, 2);
          } elseif (str_starts_with($branch, 'child::')) {
              $branch = 'self::'.substr($branch, 7);
          } elseif ('/' === $branch[0] || '.' === $branch[0] || str_starts_with($branch, 'self::')) {
              $branch = $neverMatches;
          } elseif (str_starts_with($branch, 'descendant::')) {
              $branch = 'descendant-or-self::'.substr($branch, 12);
          } elseif (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $branch)) {
              // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes)
              $branch = $neverMatches;
          } elseif (!str_starts_with($branch, 'descendant-or-self::')) {
              $branch = 'self::'.$branch;
          }

          $rewritten[] = $opening.$branch;

          if ($length === $cursor) {
              return implode(' | ', $rewritten);
          }

          // Skip the whitespace after the "|" and start the next branch right behind it.
          $cursor += strspn($xpath, $whitespace, $cursor + 1);
          $branchStart = $cursor + 1;
      }

      return $xpath; // The XPath expression is invalid
  }
  /**
   * Looks up the node stored at the given position of the node list.
   *
   * @param int $position Index into the crawler's node list
   *
   * @return \DOMNode|null The node at that position, or null when nothing is stored there
   */
  public function getNode($position)
  {
      $node = $this->nodes[$position] ?? null;

      return $node;
  }
  /**
   * Reports how many nodes the crawler currently holds.
   *
   * @return int
   */
  #[\ReturnTypeWillChange]
  public function count()
  {
      $total = \count($this->nodes);

      return $total;
  }
  /**
   * Exposes the crawler's nodes through a new ArrayIterator.
   *
   * @return \ArrayIterator|\DOMNode[]
   */
  #[\ReturnTypeWillChange]
  public function getIterator()
  {
      $iterator = new \ArrayIterator($this->nodes);

      return $iterator;
  }
  /**
   * Walks a sibling chain starting at the given node and collects its element nodes.
   *
   * The walk follows the property named by $siblingDir until it yields a falsy value.
   * The starting node itself is inspected too; the crawler's first node is never included.
   *
   * @param \DOMElement $node       Node the walk starts from
   * @param string      $siblingDir Name of the node property to follow ('nextSibling' by default)
   *
   * @return array The element nodes met along the way, in walk order
   */
  protected function sibling($node, $siblingDir = 'nextSibling')
  {
      $collected = [];
      $firstNode = $this->getNode(0);
      $cursor = $node;

      do {
          // Skip the crawler's own first node and anything that is not an element.
          if ($cursor === $firstNode || \XML_ELEMENT_NODE !== $cursor->nodeType) {
              continue;
          }

          $collected[] = $cursor;
      } while ($cursor = $cursor->$siblingDir);

      return $collected;
  }
  /**
   * Parses HTML content with the configured HTML5 parser.
   *
   * The content is first passed through convertToHtmlEntities() with the given charset.
   */
  private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
  {
      $entityEncoded = $this->convertToHtmlEntities($htmlContent, $charset);

      return $this->html5Parser->parse($entityEncoded);
  }
  /**
   * Parses HTML content with DOMDocument::loadHTML().
   *
   * The content is first passed through convertToHtmlEntities(). libxml internal error
   * handling is switched on for the duration of the parse and put back afterwards; on
   * libxml older than 2.9.0 the entity loader is disabled and restored the same way.
   * Content that is empty once trimmed is not loaded, leaving the document empty.
   */
  private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
  {
      $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);

      $previousErrorMode = libxml_use_internal_errors(true);

      $guardEntityLoader = \LIBXML_VERSION < 20900;
      if ($guardEntityLoader) {
          $previousLoaderState = libxml_disable_entity_loader(true);
      }

      $document = new \DOMDocument('1.0', $charset);
      $document->validateOnParse = true;

      $hasContent = '' !== trim($htmlContent);
      if ($hasContent) {
          @$document->loadHTML($htmlContent);
      }

      // Put the global libxml settings back the way they were found.
      libxml_use_internal_errors($previousErrorMode);

      if ($guardEntityLoader) {
          libxml_disable_entity_loader($previousLoaderState);
      }

      return $document;
  }
  /**
   * Converts charset to HTML-entities to ensure valid parsing.
   *
   * Code points from 0x80 upwards are replaced by numeric entities. A temporary error
   * handler turns any PHP error raised meanwhile into an exception. If the direct
   * conversion fails, the content is converted to UTF-8 with iconv() and encoded again;
   * should that fail too, the content is returned in whatever state it reached.
   */
  private function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
  {
      // Conversion map for mb_encode_numericentity().
      $conversionMap = [0x80, 0x10FFFF, 0, 0x1FFFFF];

      set_error_handler(function () {
          throw new \Exception();
      });

      try {
          return mb_encode_numericentity($htmlContent, $conversionMap, $charset);
      } catch (\Exception|\ValueError $directFailure) {
          try {
              // $htmlContent is reassigned step by step on purpose: if the second step
              // fails, the iconv() result is what gets returned.
              $htmlContent = iconv($charset, 'UTF-8', $htmlContent);
              $htmlContent = mb_encode_numericentity($htmlContent, $conversionMap, 'UTF-8');
          } catch (\Exception|\ValueError $fallbackFailure) {
              // Best effort only: keep the content as it is.
          }

          return $htmlContent;
      } finally {
          restore_error_handler();
      }
  }
  /**
   * Creates a DOMXPath for the document and registers the namespaces of the given prefixes.
   *
   * Each prefix is resolved through discoverNamespace(); prefixes for which no namespace
   * is found are left unregistered.
   *
   * @throws \InvalidArgumentException Kept from the original contract; no throw is visible in this method itself
   */
  private function createDOMXPath(\DOMDocument $document, array $prefixes = []): \DOMXPath
  {
      $evaluator = new \DOMXPath($document);

      foreach ($prefixes as $prefix) {
          $namespaceUri = $this->discoverNamespace($evaluator, $prefix);

          if (null === $namespaceUri) {
              continue;
          }

          $evaluator->registerNamespace($prefix, $namespaceUri);
      }

      return $evaluator;
  }
  /**
   * Resolves the namespace belonging to a prefix.
   *
   * A namespace registered on the crawler for that prefix wins. Otherwise the document
   * behind the DOMXPath is searched for a namespace node with that name; the crawler's
   * default namespace prefix is looked up under the empty name.
   *
   * @return string|null The namespace, or null when the document declares none for the prefix
   *
   * @throws \InvalidArgumentException Kept from the original contract; no throw is visible in this method itself
   */
  private function discoverNamespace(\DOMXPath $domxpath, string $prefix): ?string
  {
      $registered = $this->namespaces[$prefix] ?? null;
      if (null !== $registered) {
          return $registered;
      }

      $lookupName = $this->defaultNamespacePrefix === $prefix ? '' : $prefix;

      // ask for one namespace, otherwise we'd get a collection with an item for each node
      $found = $domxpath->query(sprintf('(//namespace::*[name()="%s"])[last()]', $lookupName));
      $namespaceNode = $found->item(0);

      return $namespaceNode ? $namespaceNode->nodeValue : null;
  }
  /**
   * Extracts the distinct namespace prefixes that appear in an XPath expression.
   *
   * A prefix is a name followed by a single colon whose next character is not a double
   * quote, a slash or another colon (so axis specifiers such as "descendant::" are ignored).
   *
   * @return string[] The prefixes found, or an empty array when there is none
   */
  private function findNamespacePrefixes(string $xpath): array
  {
      $found = preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $matches);

      return $found ? array_unique($matches['prefix']) : [];
  }
  /**
   * Creates a crawler for some subnodes.
   *
   * The new instance is built with the same URI and base href as this crawler and is
   * given this crawler's HTML flag, document, namespace map and HTML5 parser.
   *
   * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $nodes
   *
   * @return static
   */
  private function createSubCrawler($nodes)
  {
      $child = new static($nodes, $this->uri, $this->baseHref);

      // Carry the parent's parsing context over to the child.
      $child->isHtml = $this->isHtml;
      $child->document = $this->document;
      $child->namespaces = $this->namespaces;
      $child->html5Parser = $this->html5Parser;

      return $child;
  }
  /**
   * Instantiates a CssSelectorConverter configured with the crawler's HTML flag.
   *
   * @throws \LogicException If the CssSelector Component is not available
   */
  private function createCssSelectorConverter(): CssSelectorConverter
  {
      if (class_exists(CssSelectorConverter::class)) {
          return new CssSelectorConverter($this->isHtml);
      }

      throw new \LogicException('To filter with a CSS selector, install the CssSelector component ("composer require symfony/css-selector"). Or use filterXpath instead.');
  }
  /**
   * Parses a string into a DOMDocument, choosing the parser from the content.
   *
   * parseHtml5() is used when canParseHtml5String() accepts the content;
   * parseXhtml() is used otherwise.
   */
  private function parseHtmlString(string $content, string $charset): \DOMDocument
  {
      return $this->canParseHtml5String($content)
          ? $this->parseHtml5($content, $charset)
          : $this->parseXhtml($content, $charset);
  }
  /**
   * Tells whether the content should be handed to the HTML5 parser.
   *
   * That requires an HTML5 parser to be set, a "<!doctype html>" marker (matched
   * case-insensitively) somewhere in the content, and whatever precedes the marker to be
   * either empty or accepted by isValidHtml5Heading().
   */
  private function canParseHtml5String(string $content): bool
  {
      if (null === $this->html5Parser) {
          return false;
      }

      $doctypeOffset = stripos($content, '<!doctype html>');
      if (false === $doctypeOffset) {
          return false;
      }

      $preamble = substr($content, 0, $doctypeOffset);
      if ('' === $preamble) {
          return true;
      }

      return $this->isValidHtml5Heading($preamble);
  }
  /**
   * Checks that a heading consists only of an optional U+FEFF character, whitespace
   * and comments of the form "<!-- ... -->" that contain no ">" in their text.
   */
  private function isValidHtml5Heading(string $heading): bool
  {
      $matchCount = preg_match('/^\x{FEFF}?\s*(<!--[^>]*?-->\s*)*$/u', $heading);

      return 1 === $matchCount;
  }
  1144. }