Drupal investigation

Crawler.php 36KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\DomCrawler;
  11. use Symfony\Component\CssSelector\CssSelectorConverter;
  12. /**
  13. * Crawler eases navigation of a list of \DOMNode objects.
  14. *
  15. * @author Fabien Potencier <fabien@symfony.com>
  16. */
  17. class Crawler extends \SplObjectStorage
  18. {
  19. /**
  20. * @var string The current URI
  21. */
  22. protected $uri;
  23. /**
  24. * @var string The default namespace prefix to be used with XPath and CSS expressions
  25. */
  26. private $defaultNamespacePrefix = 'default';
  27. /**
  28. * @var array A map of manually registered namespaces
  29. */
  30. private $namespaces = array();
  31. /**
  32. * @var string The base href value
  33. */
  34. private $baseHref;
  35. /**
  36. * @var \DOMDocument|null
  37. */
  38. private $document;
  39. /**
  40. * Whether the Crawler contains HTML or XML content (used when converting CSS to XPath).
  41. *
  42. * @var bool
  43. */
  44. private $isHtml = true;
  /**
   * Builds a crawler and optionally seeds it with an initial node.
   *
   * @param mixed  $node       A Node to use as the base for the crawling
   * @param string $currentUri The current URI
   * @param string $baseHref   The base href value; the current URI is used when this is empty
   */
  public function __construct($node = null, $currentUri = null, $baseHref = null)
  {
      $this->uri = $currentUri;
      // Any falsy base href (null, '', ...) falls back to the current URI.
      $this->baseHref = $baseHref ? $baseHref : $currentUri;

      $this->add($node);
  }
  /**
   * Empties the crawler: forgets the tracked document and drops every stored node.
   */
  public function clear()
  {
      $this->document = null;
      // Call the parent directly so the deprecated removeAll() override is bypassed.
      parent::removeAll($this);
  }
  /**
   * Adds a node to the current list of nodes.
   *
   * Dispatches to the specialized add*() method matching the argument type.
   * Passing null is a no-op.
   *
   * @param \DOMNodeList|\DOMNode|array|string|null $node A node
   *
   * @throws \InvalidArgumentException When node is not the expected type.
   */
  public function add($node)
  {
      if (null === $node) {
          return;
      }

      if ($node instanceof \DOMNodeList) {
          $this->addNodeList($node);

          return;
      }

      if ($node instanceof \DOMNode) {
          $this->addNode($node);

          return;
      }

      if (is_array($node)) {
          $this->addNodes($node);

          return;
      }

      if (is_string($node)) {
          $this->addContent($node);

          return;
      }

      $actualType = is_object($node) ? get_class($node) : gettype($node);

      throw new \InvalidArgumentException(sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', $actualType));
  }
  /**
   * Adds HTML/XML content.
   *
   * If the charset is not set via the content type, it is looked up in a
   * <meta> tag of the content; failing that it is assumed to be ISO-8859-1,
   * which is the default charset defined by the HTTP 1.1 specification.
   *
   * Content whose type mentions neither "xml" nor "html" is ignored.
   *
   * @param string      $content A string to parse as HTML/XML
   * @param null|string $type    The content type of the string
   */
  public function addContent($content, $type = null)
  {
      if (empty($type)) {
          $type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html';
      }

      // DOM only for HTML/XML content
      if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
          return;
      }

      $charset = null;
      if (false !== $pos = stripos($type, 'charset=')) {
          $charset = substr($type, $pos + 8);
          if (false !== $pos = strpos($charset, ';')) {
              $charset = substr($charset, 0, $pos);
          }

          // Parameter values may be quoted and/or padded (charset="utf-8").
          $charset = trim($charset, " \t\"'");
          if ('' === $charset) {
              // An empty charset parameter carries no information; use the fallbacks below.
              $charset = null;
          }
      }

      // http://www.w3.org/TR/encoding/#encodings
      // http://www.w3.org/TR/REC-xml/#NT-EncName
      if (null === $charset &&
          preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) {
          $charset = $matches[1];
      }

      if (null === $charset) {
          $charset = 'ISO-8859-1';
      }

      // The type was matched case-insensitively, so compare the captured group the same way
      // (otherwise "TEXT/XML" would be parsed as HTML).
      if ('x' === strtolower($xmlMatches[1])) {
          $this->addXmlContent($content, $charset);
      } else {
          $this->addHtmlContent($content, $charset);
      }
  }
  /**
   * Adds an HTML content to the list of nodes.
   *
   * The libxml errors are disabled when the content is parsed.
   *
   * If you want to get parsing errors, be sure to enable
   * internal errors via libxml_use_internal_errors(true)
   * and then, get the errors via libxml_get_errors(). Be
   * sure to clear errors with libxml_clear_errors() afterward.
   *
   * After parsing, a <base href> found in the content updates the base href
   * used by this crawler.
   *
   * @param string $content The HTML content
   * @param string $charset The charset
   */
  public function addHtmlContent($content, $charset = 'UTF-8')
  {
      // Remember the previous libxml settings so they can be restored after parsing.
      $previousErrorMode = libxml_use_internal_errors(true);
      $previousEntityLoader = libxml_disable_entity_loader(true);

      $document = new \DOMDocument('1.0', $charset);
      $document->validateOnParse = true;

      // Turn any PHP error raised by the conversion into an exception so it can be ignored.
      set_error_handler(function () { throw new \Exception(); });

      try {
          // Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML()
          $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
      } catch (\Exception $e) {
          // Conversion failed: the content is parsed as it was given.
      }

      restore_error_handler();

      if ('' !== trim($content)) {
          @$document->loadHTML($content);
      }

      libxml_use_internal_errors($previousErrorMode);
      libxml_disable_entity_loader($previousEntityLoader);

      $this->addDocument($document);

      $hrefs = $this->filterRelativeXPath('descendant-or-self::base')->extract(array('href'));
      $declaredBase = current($hrefs);

      if (!count($hrefs) || empty($declaredBase)) {
          return;
      }

      if (!$this->baseHref) {
          $this->baseHref = $declaredBase;

          return;
      }

      // Resolve the declared base against the current one through a throw-away <a> element.
      $anchor = $document->createElement('a');
      $anchor->setAttribute('href', $declaredBase);
      $resolved = new Link($anchor, $this->baseHref);
      $this->baseHref = $resolved->getUri();
  }
  /**
   * Adds an XML content to the list of nodes.
   *
   * The libxml errors are disabled when the content is parsed.
   *
   * If you want to get parsing errors, be sure to enable
   * internal errors via libxml_use_internal_errors(true)
   * and then, get the errors via libxml_get_errors(). Be
   * sure to clear errors with libxml_clear_errors() afterward.
   *
   * Calling this method marks the crawler as holding XML rather than HTML.
   *
   * @param string $content The XML content
   * @param string $charset The charset
   * @param int    $options Bitwise OR of the libxml option constants
   *                        LIBXML_PARSEHUGE is dangerous, see
   *                        http://symfony.com/blog/security-release-symfony-2-0-17-released
   */
  public function addXmlContent($content, $charset = 'UTF-8', $options = LIBXML_NONET)
  {
      // remove the default namespace if it's the only namespace to make XPath expressions simpler
      $hasPrefixedNamespace = false !== strpos($content, 'xmlns:');
      if (!$hasPrefixedNamespace) {
          $content = str_replace('xmlns', 'ns', $content);
      }

      // Remember the previous libxml settings so they can be restored after parsing.
      $previousErrorMode = libxml_use_internal_errors(true);
      $previousEntityLoader = libxml_disable_entity_loader(true);

      $document = new \DOMDocument('1.0', $charset);
      $document->validateOnParse = true;

      if ('' !== trim($content)) {
          @$document->loadXML($content, $options);
      }

      libxml_use_internal_errors($previousErrorMode);
      libxml_disable_entity_loader($previousEntityLoader);

      $this->addDocument($document);

      $this->isHtml = false;
  }
  /**
   * Adds the root element of a \DOMDocument to the list of nodes.
   *
   * A document without a root element adds nothing.
   *
   * @param \DOMDocument $dom A \DOMDocument instance
   */
  public function addDocument(\DOMDocument $dom)
  {
      $root = $dom->documentElement;
      if (!$root) {
          return;
      }

      $this->addNode($root);
  }
  /**
   * Adds every \DOMNode of a \DOMNodeList to the list of nodes.
   *
   * @param \DOMNodeList $nodes A \DOMNodeList instance
   */
  public function addNodeList(\DOMNodeList $nodes)
  {
      foreach ($nodes as $item) {
          if (!$item instanceof \DOMNode) {
              // Entries that are not DOM nodes are skipped.
              continue;
          }

          $this->addNode($item);
      }
  }
  /**
   * Adds an array of \DOMNode instances to the list of nodes.
   *
   * Each entry goes through add(), so any type accepted by add() works here.
   *
   * @param \DOMNode[] $nodes An array of \DOMNode instances
   */
  public function addNodes(array $nodes)
  {
      foreach ($nodes as $entry) {
          $this->add($entry);
      }
  }
  /**
   * Adds a \DOMNode instance to the list of nodes.
   *
   * When a \DOMDocument is given, its root element is added instead; a
   * document without a root element adds nothing. The first node added
   * determines the document tracked by the crawler; adding a node that
   * belongs to another document triggers a deprecation notice.
   *
   * @param \DOMNode $node A \DOMNode instance
   */
  public function addNode(\DOMNode $node)
  {
      if ($node instanceof \DOMDocument) {
          $node = $node->documentElement;

          if (null === $node) {
              // Empty document: there is no root element to attach (same outcome as addDocument()).
              return;
          }
      }

      if (null !== $this->document && $this->document !== $node->ownerDocument) {
          @trigger_error('Attaching DOM nodes from multiple documents in a Crawler is deprecated as of 2.8 and will be forbidden in 3.0.', E_USER_DEPRECATED);
      }

      if (null === $this->document) {
          $this->document = $node->ownerDocument;
      }

      // Call the parent directly so the deprecated attach() override is bypassed.
      parent::attach($node);
  }
  // Serializing and unserializing a crawler creates DOM objects in a corrupted state. DOM elements are not properly serializable.

  /**
   * Always refuses to unserialize a crawler.
   *
   * @param string $serialized Ignored
   *
   * @throws \BadMethodCallException Always
   */
  public function unserialize($serialized)
  {
      $message = 'A Crawler cannot be serialized.';

      throw new \BadMethodCallException($message);
  }
  /**
   * Always refuses to serialize a crawler.
   *
   * @throws \BadMethodCallException Always
   */
  public function serialize()
  {
      $message = 'A Crawler cannot be serialized.';

      throw new \BadMethodCallException($message);
  }
  /**
   * Returns a node given its position in the node list.
   *
   * When no node sits at that position, the sub-crawler is built from null.
   *
   * @param int $position The position
   *
   * @return self
   */
  public function eq($position)
  {
      $found = null;

      foreach ($this as $index => $candidate) {
          // Loose comparison on purpose: numeric strings are accepted as positions.
          if ($index == $position) {
              $found = $candidate;
              break;
          }
      }

      return $this->createSubCrawler($found);
  }
  /**
   * Calls an anonymous function on each node of the list.
   *
   * The anonymous function receives the node wrapped in a Crawler instance
   * as first argument and its position as second argument.
   *
   * Example:
   *
   *     $crawler->filter('h1')->each(function ($node, $i) {
   *         return $node->text();
   *     });
   *
   * @param \Closure $closure An anonymous function
   *
   * @return array An array of values returned by the anonymous function
   */
  public function each(\Closure $closure)
  {
      $results = array();

      foreach ($this as $position => $element) {
          $wrapped = $this->createSubCrawler($element);
          $results[] = $closure($wrapped, $position);
      }

      return $results;
  }
  /**
   * Slices the list of nodes by $offset and $length.
   *
   * Both values are handed to \LimitIterator, where a length of -1 means no limit.
   *
   * @param int $offset
   * @param int $length
   *
   * @return self
   */
  public function slice($offset = 0, $length = -1)
  {
      $window = new \LimitIterator($this, $offset, $length);
      $selected = iterator_to_array($window);

      return $this->createSubCrawler($selected);
  }
  /**
   * Reduces the list of nodes by calling an anonymous function.
   *
   * The function receives the node wrapped in a Crawler instance and its
   * position. A node is dropped only when the function returns exactly false.
   *
   * @param \Closure $closure An anonymous function
   *
   * @return self
   */
  public function reduce(\Closure $closure)
  {
      $kept = array();

      foreach ($this as $position => $candidate) {
          if (false === $closure($this->createSubCrawler($candidate), $position)) {
              continue;
          }

          $kept[] = $candidate;
      }

      return $this->createSubCrawler($kept);
  }
  /**
   * Returns the first node of the current selection (position 0).
   *
   * @return self
   */
  public function first()
  {
      $firstPosition = 0;

      return $this->eq($firstPosition);
  }
  /**
   * Returns the last node of the current selection.
   *
   * @return self
   */
  public function last()
  {
      $lastPosition = count($this) - 1;

      return $this->eq($lastPosition);
  }
  /**
   * Returns the siblings nodes of the current selection.
   *
   * The walk starts from the first child of the first node's parent.
   *
   * @return self
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function siblings()
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstOfParent = $this->getNode(0)->parentNode->firstChild;

      return $this->createSubCrawler($this->sibling($firstOfParent));
  }
  /**
   * Returns the next siblings nodes of the current selection.
   *
   * @return self
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function nextAll()
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $start = $this->getNode(0);
      $following = $this->sibling($start);

      return $this->createSubCrawler($following);
  }
  /**
   * Returns the previous sibling nodes of the current selection.
   *
   * @return self
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function previousAll()
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $start = $this->getNode(0);
      $preceding = $this->sibling($start, 'previousSibling');

      return $this->createSubCrawler($preceding);
  }
  /**
   * Returns the parents nodes of the current selection.
   *
   * Walks up from the first node and keeps only element ancestors,
   * nearest ancestor first.
   *
   * @return self
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function parents()
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $ancestors = array();

      for ($ancestor = $this->getNode(0)->parentNode; $ancestor; $ancestor = $ancestor->parentNode) {
          if (XML_ELEMENT_NODE !== $ancestor->nodeType) {
              continue;
          }

          $ancestors[] = $ancestor;
      }

      return $this->createSubCrawler($ancestors);
  }
  /**
   * Returns the children nodes of the current selection.
   *
   * A first node without children yields an empty crawler.
   *
   * @return self
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function children()
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstChild = $this->getNode(0)->firstChild;

      if (!$firstChild) {
          return $this->createSubCrawler(array());
      }

      return $this->createSubCrawler($this->sibling($firstChild));
  }
  /**
   * Returns the attribute value of the first node of the list.
   *
   * @param string $attribute The attribute name
   *
   * @return string|null The attribute value or null if the attribute does not exist
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function attr($attribute)
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $first = $this->getNode(0);

      if (!$first->hasAttribute($attribute)) {
          return null;
      }

      return $first->getAttribute($attribute);
  }
  /**
   * Returns the node name of the first node of the list.
   *
   * @return string The node name
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function nodeName()
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $first = $this->getNode(0);

      return $first->nodeName;
  }
  /**
   * Returns the node value of the first node of the list.
   *
   * @return string The node value
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function text()
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $first = $this->getNode(0);

      return $first->nodeValue;
  }
  /**
   * Returns the first node of the list as HTML.
   *
   * The result is the concatenated markup of the node's children (inner HTML).
   *
   * @return string The node html
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function html()
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $fragments = array();

      foreach ($this->getNode(0)->childNodes as $child) {
          $fragments[] = $child->ownerDocument->saveHTML($child);
      }

      return implode('', $fragments);
  }
  /**
   * Extracts information from the list of nodes.
   *
   * You can extract attributes or/and the node value (_text).
   *
   * Example:
   *
   *     $crawler->filter('h1 a')->extract(array('_text', 'href'));
   *
   * When exactly one attribute is requested, each entry of the result is the
   * bare value; otherwise each entry is an array of values in the order of
   * $attributes (an empty array when no attribute is requested).
   *
   * @param array $attributes An array of attributes
   *
   * @return array An array of extracted values
   */
  public function extract($attributes)
  {
      $attributes = (array) $attributes;
      $single = 1 === count($attributes);

      $data = array();
      foreach ($this as $node) {
          $elements = array();
          foreach ($attributes as $attribute) {
              if ('_text' === $attribute) {
                  $elements[] = $node->nodeValue;
              } else {
                  $elements[] = $node->getAttribute($attribute);
              }
          }

          // Unwrap only for exactly one attribute: with an empty list there is
          // no offset 0 to read, so the (empty) row is returned as is.
          $data[] = $single ? $elements[0] : $elements;
      }

      return $data;
  }
  /**
   * Filters the list of nodes with an XPath expression.
   *
   * The XPath expression is evaluated in the context of the crawler, which
   * is considered as a fake parent of the elements inside it.
   * This means that a child selector "div" or "./div" will match only
   * the div elements of the current crawler, not their children.
   *
   * @param string $xpath An XPath expression
   *
   * @return self
   */
  public function filterXPath($xpath)
  {
      $relative = $this->relativize($xpath);

      if ('' !== $relative) {
          return $this->filterRelativeXPath($relative);
      }

      // If we dropped all expressions in the XPath while preparing it, there would be no match
      return $this->createSubCrawler(null);
  }
  /**
   * Filters the list of nodes with a CSS selector.
   *
   * This method only works if you have installed the CssSelector Symfony Component.
   *
   * @param string $selector A CSS selector
   *
   * @return self
   *
   * @throws \RuntimeException if the CssSelector Component is not available
   */
  public function filter($selector)
  {
      $converterAvailable = class_exists('Symfony\\Component\\CssSelector\\CssSelectorConverter');

      if (!$converterAvailable) {
          throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector 2.8+ is not installed (you can use filterXPath instead).');
      }

      $converter = new CssSelectorConverter($this->isHtml);

      // The CssSelector already prefixes the selector with descendant-or-self::
      $xpath = $converter->toXPath($selector);

      return $this->filterRelativeXPath($xpath);
  }
  /**
   * Selects links by name or alt value for clickable images.
   *
   * Matches <a> elements whose normalized text contains the value as a
   * space-delimited token, or that have a direct <img> child whose alt does.
   *
   * @param string $value The link text
   *
   * @return self
   */
  public function selectLink($value)
  {
      // The value is padded with spaces so it is matched as a whole token.
      $needle = static::xpathLiteral(' '.$value.' ');

      $byText = sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) ', $needle);
      $byImageAlt = sprintf('or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]]', $needle);

      return $this->filterRelativeXPath($byText.$byImageAlt);
  }
  /**
   * Selects a button by name or alt value for images.
   *
   * Matches <input> elements of type submit/button (by value) or image (by
   * alt), and <button> elements (by text); any of them also matches on an
   * exact id or name.
   *
   * @param string $value The button text
   *
   * @return self
   */
  public function selectButton($value)
  {
      // Lower-cases the type attribute inside XPath so the type test is case-insensitive.
      $type = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';
      // Space-padded literal for token matching, bare literal for exact id/name matching.
      $padded = static::xpathLiteral(' '.$value.' ');
      $exact = static::xpathLiteral($value);

      $inputByValue = sprintf('descendant-or-self::input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $type, $type, $padded);
      $inputByAltIdOrName = sprintf('or (contains(%s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $type, $padded, $exact, $exact);
      $buttonElement = sprintf('| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', $padded, $exact, $exact);

      return $this->filterRelativeXPath($inputByValue.$inputByAltIdOrName.$buttonElement);
  }
  /**
   * Returns a Link object for the first node in the list.
   *
   * @param string $method The method for the link (get by default)
   *
   * @return Link A Link instance
   *
   * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
   */
  public function link($method = 'get')
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $first = $this->getNode(0);

      if ($first instanceof \DOMElement) {
          return new Link($first, $this->baseHref, $method);
      }

      throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($first)));
  }
  /**
   * Returns an array of Link objects for the nodes in the list.
   *
   * Every link is created with the "get" method.
   *
   * @return Link[] An array of Link instances
   *
   * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
   */
  public function links()
  {
      $collected = array();

      foreach ($this as $element) {
          if ($element instanceof \DOMElement) {
              $collected[] = new Link($element, $this->baseHref, 'get');

              continue;
          }

          throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($element)));
      }

      return $collected;
  }
  /**
   * Returns a Form object for the first node in the list.
   *
   * @param array  $values An array of values for the form fields; applied only when not null
   * @param string $method The method for the form
   *
   * @return Form A Form instance
   *
   * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
   */
  public function form(array $values = null, $method = null)
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $first = $this->getNode(0);

      if (!$first instanceof \DOMElement) {
          throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($first)));
      }

      $form = new Form($first, $this->uri, $method, $this->baseHref);

      if (null === $values) {
          return $form;
      }

      $form->setValues($values);

      return $form;
  }
  /**
   * Replaces the default namespace prefix used with XPath and CSS expressions.
   *
   * @param string $prefix The new default prefix
   */
  public function setDefaultNamespacePrefix($prefix)
  {
      $this->defaultNamespacePrefix = $prefix;
  }
  /**
   * Records a namespace under the given prefix in the map of manually
   * registered namespaces; an existing entry for the prefix is overwritten.
   *
   * @param string $prefix
   * @param string $namespace
   */
  public function registerNamespace($prefix, $namespace)
  {
      $this->namespaces[$prefix] = $namespace;
  }
  /**
   * Converts string for XPath expressions.
   *
   * Escaped characters are: quotes (") and apostrophe (').
   *
   *  Examples:
   *  <code>
   *     echo Crawler::xpathLiteral('foo " bar');
   *     //prints 'foo " bar'
   *
   *     echo Crawler::xpathLiteral("foo ' bar");
   *     //prints "foo ' bar"
   *
   *     echo Crawler::xpathLiteral('a\'b"c');
   *     //prints concat('a', "'", 'b"c')
   *  </code>
   *
   * @param string $s String to be escaped
   *
   * @return string Converted string
   */
  public static function xpathLiteral($s)
  {
      // No apostrophe: a single-quoted literal is enough.
      if (false === strpos($s, "'")) {
          return "'".$s."'";
      }

      // Apostrophes but no double quote: a double-quoted literal is enough.
      if (false === strpos($s, '"')) {
          return '"'.$s.'"';
      }

      // Both kinds of quote: single-quote every apostrophe-free segment and
      // join the segments with a double-quoted apostrophe through concat().
      $segments = explode("'", $s);
      $tail = array_pop($segments);

      $pieces = array();
      foreach ($segments as $segment) {
          $pieces[] = "'".$segment."'";
          $pieces[] = '"\'"';
      }
      $pieces[] = "'".$tail."'";

      return 'concat('.implode(', ', $pieces).')';
  }
  /**
   * Triggers the deprecation notice, then forwards to \SplObjectStorage::attach().
   *
   * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
   */
  public function attach($object, $data = null)
  {
      $calledMethod = __METHOD__;
      $this->triggerDeprecation($calledMethod);

      parent::attach($object, $data);
  }
  /**
   * Triggers the deprecation notice, then forwards to \SplObjectStorage::detach().
   *
   * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
   */
  public function detach($object)
  {
      $calledMethod = __METHOD__;
      $this->triggerDeprecation($calledMethod);

      parent::detach($object);
  }
  /**
   * Triggers the deprecation notice, then forwards to \SplObjectStorage::contains().
   *
   * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
   */
  public function contains($object)
  {
      $calledMethod = __METHOD__;
      $this->triggerDeprecation($calledMethod);

      $isStored = parent::contains($object);

      return $isStored;
  }
  /**
   * Triggers the deprecation notice, then forwards to \SplObjectStorage::addAll().
   *
   * The parent's return value is passed through so the override keeps the
   * contract of the method it replaces.
   *
   * @return int The value returned by \SplObjectStorage::addAll()
   *
   * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
   */
  public function addAll($storage)
  {
      $this->triggerDeprecation(__METHOD__);

      return parent::addAll($storage);
  }
  /**
   * Triggers the deprecation notice, then forwards to \SplObjectStorage::removeAll().
   *
   * The parent's return value is passed through so the override keeps the
   * contract of the method it replaces.
   *
   * @return int The value returned by \SplObjectStorage::removeAll()
   *
   * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
   */
  public function removeAll($storage)
  {
      $this->triggerDeprecation(__METHOD__);

      return parent::removeAll($storage);
  }
  /**
   * Triggers the deprecation notice, then forwards to \SplObjectStorage::removeAllExcept().
   *
   * The parent's return value is passed through so the override keeps the
   * contract of the method it replaces.
   *
   * @return int The value returned by \SplObjectStorage::removeAllExcept()
   *
   * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
   */
  public function removeAllExcept($storage)
  {
      $this->triggerDeprecation(__METHOD__);

      return parent::removeAllExcept($storage);
  }
  /**
   * Triggers the deprecation notice, then forwards to \SplObjectStorage::getInfo().
   *
   * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
   */
  public function getInfo()
  {
      $calledMethod = __METHOD__;
      $this->triggerDeprecation($calledMethod);

      $info = parent::getInfo();

      return $info;
  }
  /**
   * Triggers the deprecation notice, then forwards to \SplObjectStorage::setInfo().
   *
   * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
   */
  public function setInfo($data)
  {
      $calledMethod = __METHOD__;
      $this->triggerDeprecation($calledMethod);

      parent::setInfo($data);
  }
  /**
   * Triggers the deprecation notice, then forwards to \SplObjectStorage::offsetExists().
   *
   * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
   */
  public function offsetExists($object)
  {
      $calledMethod = __METHOD__;
      $this->triggerDeprecation($calledMethod);

      $exists = parent::offsetExists($object);

      return $exists;
  }
  /**
   * Triggers the deprecation notice, then forwards to \SplObjectStorage::offsetSet().
   *
   * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
   */
  public function offsetSet($object, $data = null)
  {
      $calledMethod = __METHOD__;
      $this->triggerDeprecation($calledMethod);

      parent::offsetSet($object, $data);
  }
  /**
   * Triggers the deprecation notice, then forwards to \SplObjectStorage::offsetUnset().
   *
   * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
   */
  public function offsetUnset($object)
  {
      $calledMethod = __METHOD__;
      $this->triggerDeprecation($calledMethod);

      parent::offsetUnset($object);
  }
  /**
   * Triggers the deprecation notice, then forwards to \SplObjectStorage::offsetGet().
   *
   * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
   */
  public function offsetGet($object)
  {
      $calledMethod = __METHOD__;
      $this->triggerDeprecation($calledMethod);

      $data = parent::offsetGet($object);

      return $data;
  }
  /**
   * Filters the list of nodes with an XPath expression.
   *
   * The XPath expression should already be processed to apply it in the context of each node.
   * The expression is run once per node of the crawler, against that node's
   * own document, and all matches are gathered in one sub-crawler.
   *
   * @param string $xpath
   *
   * @return self
   */
  private function filterRelativeXPath($xpath)
  {
      $namespacePrefixes = $this->findNamespacePrefixes($xpath);
      $matches = $this->createSubCrawler(null);

      foreach ($this as $contextNode) {
          $evaluator = $this->createDOMXPath($contextNode->ownerDocument, $namespacePrefixes);
          $found = $evaluator->query($xpath, $contextNode);
          $matches->add($found);
      }

      return $matches;
  }
  /**
   * Make the XPath relative to the current context.
   *
   * The input is split on its top-level "|" operators, i.e. those that are
   * neither inside a quoted string nor inside a "[...]" predicate. Each
   * operand is rewritten on its own and the results are joined with " | ".
   * Operands that can never match inside the crawler are replaced by an
   * expression that never matches, rather than being removed.
   *
   * An expression with an unterminated string literal, or whose brackets are
   * still unbalanced at the end, is returned unchanged.
   *
   * @param string $xpath
   *
   * @return string
   */
  private function relativize($xpath)
  {
      // Stand-in for operands which cannot match anything in the crawler.
      $neverMatches = 'a[name() = "b"]';
      $whitespace = " \t\n\r\0\x0B";

      $rewritten = array();
      $length = strlen($xpath);
      $depth = 0;
      $operandStart = strspn($xpath, $whitespace);

      for ($cursor = $operandStart; $cursor <= $length; ++$cursor) {
          // Jump to the next quote, bracket or union operator (or to the end of the input).
          $cursor += strcspn($xpath, '"\'[]|', $cursor);

          if ($cursor < $length) {
              $char = $xpath[$cursor];

              if ('"' === $char || "'" === $char) {
                  // Move onto the closing quote; the loop increment then steps past it.
                  $cursor = strpos($xpath, $char, $cursor + 1);
                  if (false === $cursor) {
                      return $xpath; // The XPath expression is invalid
                  }

                  continue;
              }

              if ('[' === $char) {
                  ++$depth;

                  continue;
              }

              if (']' === $char) {
                  --$depth;

                  continue;
              }
          }

          // From here on the cursor sits on a "|" or at the end of the input.
          // Neither ends an operand while the bracket count is off balance.
          if (0 !== $depth) {
              continue;
          }

          $openingParens = '';
          if ($operandStart < $length && '(' === $xpath[$operandStart]) {
              // The operand opens with parentheses: keep them (and the whitespace
              // mixed in) verbatim and rewrite only what follows.
              $prefixLength = 1 + strspn($xpath, '('.$whitespace, $operandStart + 1);
              $openingParens = substr($xpath, $operandStart, $prefixLength);
              $operandStart += $prefixLength;
          }

          $operand = rtrim(substr($xpath, $operandStart, $cursor - $operandStart));

          // BC for Symfony 2.4 and lower, where elements were added under a fake _root parent
          if (0 === strpos($operand, '/_root/')) {
              @trigger_error('XPath expressions referencing the fake root node are deprecated since version 2.8 and will be unsupported in 3.0. Please use "./" instead of "/_root/".', E_USER_DEPRECATED);
              $operand = './'.substr($operand, 7);
          } elseif (0 === strpos($operand, 'self::*/')) {
              $operand = './'.substr($operand, 8);
          }

          // Re-anchor the leading step of the operand on the context node.
          if ('' === $operand) {
              $operand = $neverMatches;
          } elseif (0 === strpos($operand, '//')) {
              $operand = 'descendant-or-self::'.substr($operand, 2);
          } elseif (0 === strpos($operand, './/')) {
              $operand = 'descendant-or-self::'.substr($operand, 3);
          } elseif (0 === strpos($operand, './')) {
              $operand = 'self::'.substr($operand, 2);
          } elseif (0 === strpos($operand, 'child::')) {
              $operand = 'self::'.substr($operand, 7);
          } elseif ('/' === $operand[0] || 0 === strpos($operand, 'self::') || '.' === $operand[0]) {
              // The only direct child in Symfony 2.4 and lower is _root, which is already
              // handled above, and '.' is the fake root element itself, which is excluded
              // from results: neither can match.
              $operand = $neverMatches;
          } elseif (0 === strpos($operand, 'descendant::')) {
              $operand = 'descendant-or-self::'.substr($operand, 12);
          } elseif (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $operand)) {
              // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes)
              $operand = $neverMatches;
          } elseif (0 !== strpos($operand, 'descendant-or-self::')) {
              $operand = 'self::'.$operand;
          }

          $rewritten[] = $openingParens.$operand;

          if ($cursor === $length) {
              return implode(' | ', $rewritten);
          }

          // Skip the whitespace after the "|"; the next operand starts right behind it.
          $cursor += strspn($xpath, $whitespace, $cursor + 1);
          $operandStart = $cursor + 1;
      }

      return $xpath; // The XPath expression is invalid
  }
  /**
   * Returns the node found at the given iteration position.
   *
   * The crawler is iterated from the start and the first entry whose
   * iteration key loosely equals $position is returned.
   *
   * @param int $position
   *
   * @return \DOMElement|null The node, or null when no key equals $position
   */
  public function getNode($position)
  {
      $found = null;

      foreach ($this as $index => $candidate) {
          if ($index == $position) {
              $found = $candidate;

              break;
          }
      }

      return $found;
  }
  /**
   * Collects element nodes by walking a sibling chain.
   *
   * Starting with $node itself, the property named by $siblingDir is followed
   * until it yields a falsy value. Every visited node that is an element and
   * is not the first node of this crawler is collected, in visiting order.
   *
   * @param \DOMElement $node       Node to start walking from (it is itself a candidate)
   * @param string      $siblingDir Name of the property to follow, 'nextSibling' by default
   *
   * @return array
   */
  protected function sibling($node, $siblingDir = 'nextSibling')
  {
      // The first node of the crawler does not change during the walk, so it is
      // looked up once rather than re-scanning the storage for every sibling.
      $firstNode = $this->getNode(0);
      $nodes = array();

      do {
          // Node type 1 is an element node; text, comment and other node types are skipped.
          if ($node !== $firstNode && 1 === $node->nodeType) {
              $nodes[] = $node;
          }
      } while ($node = $node->$siblingDir);

      return $nodes;
  }
  /**
   * Builds a DOMXPath for a document and registers the namespaces of the given prefixes.
   *
   * Each prefix is resolved through discoverNamespace(); prefixes for which
   * no namespace is found are left unregistered.
   *
   * @param \DOMDocument $document
   * @param array        $prefixes
   *
   * @return \DOMXPath
   */
  private function createDOMXPath(\DOMDocument $document, array $prefixes = array())
  {
      $evaluator = new \DOMXPath($document);

      foreach ($prefixes as $prefix) {
          $namespaceUri = $this->discoverNamespace($evaluator, $prefix);
          if (null === $namespaceUri) {
              continue;
          }

          $evaluator->registerNamespace($prefix, $namespaceUri);
      }

      return $evaluator;
  }
  /**
   * Resolves a namespace prefix to its namespace.
   *
   * An entry in $this->namespaces wins. Otherwise the document behind
   * $domxpath is searched for a namespace node of that name.
   *
   * @param \DOMXPath $domxpath
   * @param string    $prefix
   *
   * @return string|null The namespace, or null when none is found
   */
  private function discoverNamespace(\DOMXPath $domxpath, $prefix)
  {
      if (isset($this->namespaces[$prefix])) {
          return $this->namespaces[$prefix];
      }

      // The default-namespace prefix is looked up under the empty name.
      $declaredName = $this->defaultNamespacePrefix === $prefix ? '' : $prefix;

      // ask for one namespace, otherwise we'd get a collection with an item for each node
      $query = sprintf('(//namespace::*[name()="%s"])[last()]', $declaredName);
      $declaration = $domxpath->query($query)->item(0);

      if ($declaration) {
          return $declaration->nodeValue;
      }

      return null;
  }
  /**
   * Extracts the namespace prefixes used in an XPath expression.
   *
   * A prefix is a name followed by a single ":" whose next character is not
   * a double quote, a slash or another colon; the latter rules out axis
   * specifiers such as "descendant::". Duplicates are removed with
   * array_unique(), so the original match keys are preserved.
   *
   * @param string $xpath
   *
   * @return array
   */
  private function findNamespacePrefixes($xpath)
  {
      $found = array();
      $matched = preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $found);

      if (!$matched) {
          return array();
      }

      return array_unique($found['prefix']);
  }
  /**
   * Creates a crawler for some subnodes.
   *
   * The new instance is of the same (late-bound) class and receives this
   * crawler's URI and base href through the constructor.
   *
   * @param \DOMElement|\DOMElement[]|\DOMNodeList|null $nodes
   *
   * @return static
   */
  private function createSubCrawler($nodes)
  {
      $subCrawler = new static($nodes, $this->uri, $this->baseHref);

      // Carry over the state that is not passed through the constructor.
      $subCrawler->namespaces = $this->namespaces;
      $subCrawler->document = $this->document;
      $subCrawler->isHtml = $this->isHtml;

      return $subCrawler;
  }
  /**
   * Emits the deprecation notice for a method of the SplObjectStorage API.
   *
   * When $useTrace is true, or when running on HHVM, the call stack is
   * inspected first and the notice is skipped if the deprecated method was
   * called by SplObjectStorage itself.
   *
   * @param string $methodName Name to mention in the notice
   * @param bool   $useTrace   Whether to inspect the call stack before notifying
   */
  private function triggerDeprecation($methodName, $useTrace = false)
  {
      if ($useTrace || defined('HHVM_VERSION')) {
          // The depth limit is only passed on PHP 5.4 and later.
          $frames = PHP_VERSION_ID >= 50400
              ? debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS, 3)
              : debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS);

          // Frame 2 is the caller of the deprecated method.
          $callerClass = isset($frames[2]['class']) ? $frames[2]['class'] : null;

          // The SplObjectStorage class performs calls to its own methods. These
          // method calls must not lead to triggered deprecation notices.
          if ('SplObjectStorage' === $callerClass) {
              return;
          }
      }

      @trigger_error('The '.$methodName.' method is deprecated since version 2.8 and will be removed in 3.0.', E_USER_DEPRECATED);
  }
  1042. }