Crawler.php 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\DomCrawler;
  11. use Symfony\Component\CssSelector\CssSelectorConverter;
  12. /**
  13. * Crawler eases navigation of a list of \DOMNode objects.
  14. *
  15. * @author Fabien Potencier <fabien@symfony.com>
  16. */
  17. class Crawler implements \Countable, \IteratorAggregate
  18. {
  19. /**
  20. * @var string The current URI
  21. */
  22. protected $uri;
  23. /**
  24. * @var string The default namespace prefix to be used with XPath and CSS expressions
  25. */
  26. private $defaultNamespacePrefix = 'default';
  27. /**
  28. * @var array A map of manually registered namespaces
  29. */
  30. private $namespaces = array();
  31. /**
  32. * @var string The base href value
  33. */
  34. private $baseHref;
  35. /**
  36. * @var \DOMDocument|null
  37. */
  38. private $document;
  39. /**
  40. * @var \DOMElement[]
  41. */
  42. private $nodes = array();
  43. /**
  44. * Whether the Crawler contains HTML or XML content (used when converting CSS to XPath).
  45. *
  46. * @var bool
  47. */
  48. private $isHtml = true;
  /**
   * Builds a crawler and optionally seeds it with an initial node.
   *
   * @param mixed  $node       A Node to use as the base for the crawling
   * @param string $currentUri The current URI
   * @param string $baseHref   The base href value (the current URI is used when this is empty)
   */
  public function __construct($node = null, $currentUri = null, $baseHref = null)
  {
      $this->uri = $currentUri;

      // A falsy base href falls back to the current URI.
      if ($baseHref) {
          $this->baseHref = $baseHref;
      } else {
          $this->baseHref = $currentUri;
      }

      $this->add($node);
  }
  /**
   * Gives back the URI this crawler was created with.
   *
   * @return string
   */
  public function getUri()
  {
      $currentUri = $this->uri;

      return $currentUri;
  }
  /**
   * Gives back the base href currently stored on the crawler.
   *
   * @return string
   */
  public function getBaseHref()
  {
      $href = $this->baseHref;

      return $href;
  }
  /**
   * Empties the crawler: forgets the owning document and drops every node.
   */
  public function clear()
  {
      $this->document = null;
      $this->nodes = array();
  }
  /**
   * Adds a node to the current list of nodes.
   *
   * Dispatches to the specialized add*() method matching the argument type;
   * null is accepted and ignored.
   *
   * @param \DOMNodeList|\DOMNode|array|string|null $node A node
   *
   * @throws \InvalidArgumentException When node is not the expected type.
   */
  public function add($node)
  {
      if (null === $node) {
          return;
      }

      if ($node instanceof \DOMNodeList) {
          $this->addNodeList($node);

          return;
      }

      if ($node instanceof \DOMNode) {
          $this->addNode($node);

          return;
      }

      if (is_array($node)) {
          $this->addNodes($node);

          return;
      }

      if (is_string($node)) {
          $this->addContent($node);

          return;
      }

      $givenType = is_object($node) ? get_class($node) : gettype($node);

      throw new \InvalidArgumentException(sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', $givenType));
  }
  /**
   * Adds HTML/XML content.
   *
   * When $type is empty it is guessed from the content: a leading "<?xml"
   * selects 'application/xml', anything else 'text/html'. Content whose type
   * mentions neither "xml" nor "html" (case-insensitively) is ignored.
   *
   * The charset is taken from the "charset=" parameter of $type, then from a
   * <meta ... charset=...> tag found in the content. If neither provides one,
   * it is assumed to be ISO-8859-1, which is the default charset defined by
   * the HTTP 1.1 specification.
   *
   * @param string      $content A string to parse as HTML/XML
   * @param null|string $type    The content type of the string
   */
  public function addContent($content, $type = null)
  {
      if (empty($type)) {
          $type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html';
      }

      // DOM only for HTML/XML content
      if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
          return;
      }

      $charset = null;
      if (false !== $pos = stripos($type, 'charset=')) {
          $charset = substr($type, $pos + 8);
          if (false !== $pos = strpos($charset, ';')) {
              $charset = substr($charset, 0, $pos);
          }

          // A media-type parameter value may be quoted and/or padded with
          // whitespace (charset="utf-8"); strip that so the bare charset name
          // is what gets handed to the parsers.
          $charset = trim($charset, " \t\r\n\"'");

          // An empty "charset=" carries no information: fall through to the
          // <meta> detection and the ISO-8859-1 default below.
          if ('' === $charset) {
              $charset = null;
          }
      }

      // http://www.w3.org/TR/encoding/#encodings
      // http://www.w3.org/TR/REC-xml/#NT-EncName
      if (null === $charset &&
          preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) {
          $charset = $matches[1];
      }

      if (null === $charset) {
          $charset = 'ISO-8859-1';
      }

      // The pattern above is case-insensitive, so the captured group may be
      // "X" (e.g. "application/XML"); compare in lower case to pick the XML parser.
      if ('x' === strtolower($xmlMatches[1])) {
          $this->addXmlContent($content, $charset);
      } else {
          $this->addHtmlContent($content, $charset);
      }
  }
  /**
   * Parses a string as HTML and adds the resulting document to the list of nodes.
   *
   * libxml internal error handling is switched on while parsing and the previous
   * setting is put back afterwards, so parsing problems are not reported as PHP errors.
   *
   * If you want to get parsing errors, be sure to enable
   * internal errors via libxml_use_internal_errors(true)
   * and then, get the errors via libxml_get_errors(). Be
   * sure to clear errors with libxml_clear_errors() afterward.
   *
   * After parsing, the href of a <base> element (if one is found and non-empty)
   * updates the crawler's base href.
   *
   * @param string $content The HTML content
   * @param string $charset The charset
   */
  public function addHtmlContent($content, $charset = 'UTF-8')
  {
      $previousInternalErrors = libxml_use_internal_errors(true);
      $previousEntityLoader = libxml_disable_entity_loader(true);

      $document = new \DOMDocument('1.0', $charset);
      $document->validateOnParse = true;

      // Any PHP error raised during the conversion is turned into an exception
      // and swallowed, which leaves $content as it was.
      set_error_handler(function () {throw new \Exception();});
      try {
          // Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML()
          $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
      } catch (\Exception $conversionFailure) {
      }
      restore_error_handler();

      if ('' !== trim($content)) {
          @$document->loadHTML($content);
      }

      libxml_use_internal_errors($previousInternalErrors);
      libxml_disable_entity_loader($previousEntityLoader);

      $this->addDocument($document);

      $hrefs = $this->filterRelativeXPath('descendant-or-self::base')->extract(array('href'));
      $firstHref = current($hrefs);

      if (!count($hrefs) || empty($firstHref)) {
          return;
      }

      if (!$this->baseHref) {
          $this->baseHref = $firstHref;

          return;
      }

      // Resolve the <base> href against the base href already known by
      // wrapping it in a throw-away <a> element and asking Link for its URI.
      $anchor = $document->createElement('a');
      $anchor->setAttribute('href', $firstHref);
      $resolver = new Link($anchor, $this->baseHref);
      $this->baseHref = $resolver->getUri();
  }
  /**
   * Parses a string as XML and adds the resulting document to the list of nodes.
   *
   * libxml internal error handling is switched on while parsing and the previous
   * setting is put back afterwards, so parsing problems are not reported as PHP errors.
   *
   * If you want to get parsing errors, be sure to enable
   * internal errors via libxml_use_internal_errors(true)
   * and then, get the errors via libxml_get_errors(). Be
   * sure to clear errors with libxml_clear_errors() afterward.
   *
   * Calling this method marks the crawler as holding XML (not HTML) content.
   *
   * @param string $content The XML content
   * @param string $charset The charset
   * @param int    $options Bitwise OR of the libxml option constants
   *                        LIBXML_PARSEHUGE is dangerous, see
   *                        http://symfony.com/blog/security-release-symfony-2-0-17-released
   */
  public function addXmlContent($content, $charset = 'UTF-8', $options = LIBXML_NONET)
  {
      // remove the default namespace if it's the only namespace to make XPath expressions simpler
      $hasPrefixedNamespace = preg_match('/xmlns:/', $content);
      if (!$hasPrefixedNamespace) {
          $content = str_replace('xmlns', 'ns', $content);
      }

      $previousInternalErrors = libxml_use_internal_errors(true);
      $previousEntityLoader = libxml_disable_entity_loader(true);

      $document = new \DOMDocument('1.0', $charset);
      $document->validateOnParse = true;

      if ('' !== trim($content)) {
          @$document->loadXML($content, $options);
      }

      libxml_use_internal_errors($previousInternalErrors);
      libxml_disable_entity_loader($previousEntityLoader);

      $this->addDocument($document);

      $this->isHtml = false;
  }
  /**
   * Adds the root element of a \DOMDocument to the list of nodes.
   *
   * A document without a root element is ignored.
   *
   * @param \DOMDocument $dom A \DOMDocument instance
   */
  public function addDocument(\DOMDocument $dom)
  {
      $root = $dom->documentElement;
      if (!$root) {
          return;
      }

      $this->addNode($root);
  }
  /**
   * Adds every \DOMNode found in a \DOMNodeList to the list of nodes.
   *
   * @param \DOMNodeList $nodes A \DOMNodeList instance
   */
  public function addNodeList(\DOMNodeList $nodes)
  {
      foreach ($nodes as $item) {
          if (!$item instanceof \DOMNode) {
              continue;
          }

          $this->addNode($item);
      }
  }
  /**
   * Adds each entry of an array to the list of nodes.
   *
   * Every entry is handed to add(), so it goes through the same
   * type dispatch as a single node would.
   *
   * @param \DOMNode[] $nodes An array of \DOMNode instances
   */
  public function addNodes(array $nodes)
  {
      foreach ($nodes as $entry) {
          $this->add($entry);
      }
  }
  /**
   * Adds a \DOMNode instance to the list of nodes.
   *
   * A \DOMDocument is replaced by its root element; a document that has no
   * root element contributes nothing (same behaviour as addDocument()).
   * The first node added fixes the owning document of the crawler, and a
   * node that is already present is not added a second time.
   *
   * @param \DOMNode $node A \DOMNode instance
   *
   * @throws \InvalidArgumentException When the node belongs to a different document than the nodes already added
   */
  public function addNode(\DOMNode $node)
  {
      if ($node instanceof \DOMDocument) {
          $node = $node->documentElement;

          // An empty document has no root element: there is nothing to add,
          // and reading ->ownerDocument on null below would raise an error
          // and push a null entry into the node list.
          if (null === $node) {
              return;
          }
      }

      if (null !== $this->document && $this->document !== $node->ownerDocument) {
          throw new \InvalidArgumentException('Attaching DOM nodes from multiple documents in the same crawler is forbidden.');
      }

      if (null === $this->document) {
          $this->document = $node->ownerDocument;
      }

      // Don't add duplicate nodes in the Crawler
      if (in_array($node, $this->nodes, true)) {
          return;
      }

      $this->nodes[] = $node;
  }
  /**
   * Wraps the node found at the given position in a new crawler.
   *
   * When nothing is stored at that position the sub-crawler is created from null.
   *
   * @param int $position The position
   *
   * @return self
   */
  public function eq($position)
  {
      $selected = isset($this->nodes[$position]) ? $this->nodes[$position] : null;

      return $this->createSubCrawler($selected);
  }
  /**
   * Calls an anonymous function on each node of the list.
   *
   * The anonymous function receives the node wrapped in a Crawler
   * instance as first argument and its position as second argument.
   *
   * Example:
   *
   *     $crawler->filter('h1')->each(function ($node, $i) {
   *         return $node->text();
   *     });
   *
   * @param \Closure $closure An anonymous function
   *
   * @return array An array of values returned by the anonymous function
   */
  public function each(\Closure $closure)
  {
      $results = array();

      foreach ($this->nodes as $index => $domNode) {
          $wrapped = $this->createSubCrawler($domNode);
          $results[] = $closure($wrapped, $index);
      }

      return $results;
  }
  /**
   * Builds a new crawler from a portion of the node list.
   *
   * $offset and $length are passed straight to array_slice().
   *
   * @param int $offset
   * @param int $length
   *
   * @return self
   */
  public function slice($offset = 0, $length = null)
  {
      $portion = array_slice($this->nodes, $offset, $length);

      return $this->createSubCrawler($portion);
  }
  /**
   * Reduces the list of nodes by calling an anonymous function.
   *
   * The function receives each node wrapped in a Crawler instance and its
   * position. A node is dropped only when the function returns exactly false.
   *
   * @param \Closure $closure An anonymous function
   *
   * @return self
   */
  public function reduce(\Closure $closure)
  {
      $kept = array();

      foreach ($this->nodes as $index => $domNode) {
          if (false === $closure($this->createSubCrawler($domNode), $index)) {
              continue;
          }

          $kept[] = $domNode;
      }

      return $this->createSubCrawler($kept);
  }
  /**
   * Returns a crawler for the node at position 0 of the current selection.
   *
   * @return self
   */
  public function first()
  {
      $firstIndex = 0;

      return $this->eq($firstIndex);
  }
  /**
   * Returns a crawler for the node at the final position of the current selection.
   *
   * @return self
   */
  public function last()
  {
      $lastIndex = count($this->nodes) - 1;

      return $this->eq($lastIndex);
  }
  /**
   * Returns the siblings nodes of the current selection.
   *
   * The collection starts from the first child of the first node's parent
   * and is delegated to the sibling() helper.
   *
   * @return self
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function siblings()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $startingNode = $this->getNode(0)->parentNode->firstChild;
      $siblingNodes = $this->sibling($startingNode);

      return $this->createSubCrawler($siblingNodes);
  }
  /**
   * Returns the next siblings nodes of the current selection.
   *
   * The collection is delegated to the sibling() helper, starting at the first node.
   *
   * @return self
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function nextAll()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $following = $this->sibling($this->getNode(0));

      return $this->createSubCrawler($following);
  }
  /**
   * Returns the previous sibling nodes of the current selection.
   *
   * The collection is delegated to the sibling() helper, starting at the
   * first node and walking in the 'previousSibling' direction.
   *
   * @return self
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function previousAll()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $preceding = $this->sibling($this->getNode(0), 'previousSibling');

      return $this->createSubCrawler($preceding);
  }
  /**
   * Returns the parents nodes of the current selection.
   *
   * Walks up the parentNode chain of the first node and keeps every
   * ancestor that is an element node, nearest ancestor first.
   *
   * @return self
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function parents()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $ancestors = array();

      for ($current = $this->getNode(0)->parentNode; $current; $current = $current->parentNode) {
          if (XML_ELEMENT_NODE !== $current->nodeType) {
              continue;
          }

          $ancestors[] = $current;
      }

      return $this->createSubCrawler($ancestors);
  }
  /**
   * Returns the children nodes of the current selection.
   *
   * Starts from the first child of the first node and delegates the
   * collection to the sibling() helper; a node without children yields
   * a crawler built from an empty array.
   *
   * @return self
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function children()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstChild = $this->getNode(0)->firstChild;

      if ($firstChild) {
          $childNodes = $this->sibling($firstChild);
      } else {
          $childNodes = array();
      }

      return $this->createSubCrawler($childNodes);
  }
  /**
   * Reads an attribute from the first node of the list.
   *
   * @param string $attribute The attribute name
   *
   * @return string|null The attribute value or null if the attribute does not exist
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function attr($attribute)
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $element = $this->getNode(0);

      if (!$element->hasAttribute($attribute)) {
          return null;
      }

      return $element->getAttribute($attribute);
  }
  /**
   * Reads the nodeName property of the first node of the list.
   *
   * @return string The node name
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function nodeName()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstNode = $this->getNode(0);

      return $firstNode->nodeName;
  }
  /**
   * Reads the nodeValue property of the first node of the list.
   *
   * @return string The node value
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function text()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstNode = $this->getNode(0);

      return $firstNode->nodeValue;
  }
  /**
   * Returns the first node of the list as HTML.
   *
   * The result is the concatenation of each child of the first node,
   * serialized through its owner document's saveHTML().
   *
   * @return string The node html
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function html()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $fragments = array();

      foreach ($this->getNode(0)->childNodes as $childNode) {
          $fragments[] = $childNode->ownerDocument->saveHTML($childNode);
      }

      return implode('', $fragments);
  }
  /**
   * Evaluates an XPath expression against every node of the list.
   *
   * Since an XPath expression might evaluate to either a simple type or a \DOMNodeList,
   * this method will return either an array of simple types or a new Crawler instance.
   * The decision is made by looking at the first result only.
   *
   * @param string $xpath An XPath expression
   *
   * @return array|Crawler An array of evaluation results or a new Crawler instance
   *
   * @throws \LogicException When the crawler has no document yet
   */
  public function evaluate($xpath)
  {
      if (null === $this->document) {
          throw new \LogicException('Cannot evaluate the expression on an uninitialized crawler.');
      }

      $results = array();
      $evaluator = $this->createDOMXPath($this->document, $this->findNamespacePrefixes($xpath));

      foreach ($this->nodes as $contextNode) {
          $results[] = $evaluator->evaluate($xpath, $contextNode);
      }

      $yieldsNodeLists = isset($results[0]) && $results[0] instanceof \DOMNodeList;

      if (!$yieldsNodeLists) {
          return $results;
      }

      return $this->createSubCrawler($results);
  }
  /**
   * Extracts information from the list of nodes.
   *
   * You can extract attributes or/and the node value (_text).
   *
   * Example:
   *
   *     $crawler->filter('h1 a')->extract(array('_text', 'href'));
   *
   * When exactly one attribute is requested, each entry of the result is the
   * bare value; otherwise each entry is an array of values in the order of
   * $attributes (an empty array when no attribute is requested).
   *
   * @param array $attributes An array of attributes
   *
   * @return array An array of extracted values
   */
  public function extract($attributes)
  {
      $attributes = (array) $attributes;

      // Only the single-attribute case is unwrapped. Testing for "more than
      // one" instead would read $elements[0] from an empty array when no
      // attribute is requested.
      $unwrap = 1 === count($attributes);

      $data = array();
      foreach ($this->nodes as $node) {
          $elements = array();
          foreach ($attributes as $attribute) {
              if ('_text' === $attribute) {
                  $elements[] = $node->nodeValue;
              } else {
                  $elements[] = $node->getAttribute($attribute);
              }
          }

          $data[] = $unwrap ? $elements[0] : $elements;
      }

      return $data;
  }
  /**
   * Filters the list of nodes with an XPath expression.
   *
   * The XPath expression is evaluated in the context of the crawler, which
   * is considered as a fake parent of the elements inside it.
   * This means that a child selector "div" or "./div" will match only
   * the div elements of the current crawler, not their children.
   *
   * @param string $xpath An XPath expression
   *
   * @return self
   */
  public function filterXPath($xpath)
  {
      $relativeXPath = $this->relativize($xpath);

      if ('' !== $relativeXPath) {
          return $this->filterRelativeXPath($relativeXPath);
      }

      // If we dropped all expressions in the XPath while preparing it, there would be no match
      return $this->createSubCrawler(null);
  }
  /**
   * Filters the list of nodes with a CSS selector.
   *
   * The selector is converted to XPath by the CssSelector Symfony Component,
   * which therefore has to be installed.
   *
   * @param string $selector A CSS selector
   *
   * @return self
   *
   * @throws \RuntimeException if the CssSelector Component is not available
   */
  public function filter($selector)
  {
      $converterAvailable = class_exists('Symfony\\Component\\CssSelector\\CssSelectorConverter');

      if (!$converterAvailable) {
          throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector 2.8+ is not installed (you can use filterXPath instead).');
      }

      $cssToXPath = new CssSelectorConverter($this->isHtml);

      // The CssSelector already prefixes the selector with descendant-or-self::
      $xpath = $cssToXPath->toXPath($selector);

      return $this->filterRelativeXPath($xpath);
  }
  /**
   * Selects links by name or alt value for clickable images.
   *
   * An <a> element matches when its space-normalized text, or the
   * space-normalized alt of an <img> child, contains $value as a
   * whole space-delimited word sequence.
   *
   * @param string $value The link text
   *
   * @return self
   */
  public function selectLink($value)
  {
      // The value is padded with spaces so it is matched against " text " as a whole token.
      $paddedLiteral = static::xpathLiteral(' '.$value.' ');

      $byText = sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) ', $paddedLiteral);
      $byImageAlt = sprintf('or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]]', $paddedLiteral);

      return $this->filterRelativeXPath($byText.$byImageAlt);
  }
  /**
   * Selects images by alt value.
   *
   * An <img> element matches when its space-normalized alt attribute contains $value.
   *
   * @param string $value The image alt
   *
   * @return self A new instance of Crawler with the filtered list of nodes
   */
  public function selectImage($value)
  {
      $altLiteral = static::xpathLiteral($value);

      return $this->filterRelativeXPath(
          sprintf('descendant-or-self::img[contains(normalize-space(string(@alt)), %s)]', $altLiteral)
      );
  }
  /**
   * Selects a button by name or alt value for images.
   *
   * Matches, as a union:
   *  - <input> elements whose lower-cased type contains "submit" or "button" and whose value contains $value,
   *    or whose lower-cased type contains "image" and whose alt contains $value,
   *    or whose id or name equals $value;
   *  - <button> elements whose text contains $value, or whose id or name equals $value.
   *
   * @param string $value The button text
   *
   * @return self
   */
  public function selectButton($value)
  {
      // XPath 1.0 has no lower-case function; translate() is used to compare @type case-insensitively.
      $translate = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';

      $paddedLiteral = static::xpathLiteral(' '.$value.' ');
      $exactLiteral = static::xpathLiteral($value);

      $inputByValue = sprintf('descendant-or-self::input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, $paddedLiteral);
      $inputByAltOrIdentity = sprintf('or (contains(%s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $translate, $paddedLiteral, $exactLiteral, $exactLiteral);
      $buttonElement = sprintf('| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', $paddedLiteral, $exactLiteral, $exactLiteral);

      return $this->filterRelativeXPath($inputByValue.$inputByAltOrIdentity.$buttonElement);
  }
  /**
   * Builds a Link object from the first node in the list.
   *
   * The link is created with the crawler's base href.
   *
   * @param string $method The method for the link (get by default)
   *
   * @return Link A Link instance
   *
   * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
   */
  public function link($method = 'get')
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $element = $this->getNode(0);

      if ($element instanceof \DOMElement) {
          return new Link($element, $this->baseHref, $method);
      }

      throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($element)));
  }
  /**
   * Builds one Link object per node in the list.
   *
   * Every link is created with the crawler's base href and the 'get' method.
   *
   * @return Link[] An array of Link instances
   *
   * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
   */
  public function links()
  {
      $collected = array();

      foreach ($this->nodes as $element) {
          if ($element instanceof \DOMElement) {
              $collected[] = new Link($element, $this->baseHref, 'get');

              continue;
          }

          throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($element)));
      }

      return $collected;
  }
  /**
   * Builds an Image object from the first node in the list.
   *
   * The image is created with the crawler's base href.
   *
   * @return Image An Image instance
   *
   * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
   */
  public function image()
  {
      if (!count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $element = $this->getNode(0);

      if ($element instanceof \DOMElement) {
          return new Image($element, $this->baseHref);
      }

      throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($element)));
  }
  /**
   * Builds one Image object per node obtained by iterating over the crawler.
   *
   * Every image is created with the crawler's base href.
   *
   * @return Image[] An array of Image instances
   *
   * @throws \InvalidArgumentException If a non-DOMElement instance is encountered
   */
  public function images()
  {
      $collected = array();

      foreach ($this as $element) {
          if ($element instanceof \DOMElement) {
              $collected[] = new Image($element, $this->baseHref);

              continue;
          }

          throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($element)));
      }

      return $collected;
  }
  /**
   * Builds a Form object from the first node in the list.
   *
   * The form is created with the crawler's URI and base href; when $values
   * is given, it is applied to the form through setValues().
   *
   * @param array  $values An array of values for the form fields
   * @param string $method The method for the form
   *
   * @return Form A Form instance
   *
   * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
   */
  public function form(array $values = null, $method = null)
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $element = $this->getNode(0);

      if (!$element instanceof \DOMElement) {
          throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($element)));
      }

      $built = new Form($element, $this->uri, $method, $this->baseHref);

      if (null === $values) {
          return $built;
      }

      $built->setValues($values);

      return $built;
  }
  /**
   * Replaces the default namespace prefix to be used with XPath and CSS expressions.
   *
   * @param string $prefix The new default prefix
   */
  public function setDefaultNamespacePrefix($prefix)
  {
      $newPrefix = $prefix;

      $this->defaultNamespacePrefix = $newPrefix;
  }
  /**
   * Stores a namespace in the map of manually registered namespaces,
   * keyed by its prefix. Registering a prefix again overwrites the previous entry.
   *
   * @param string $prefix    The namespace prefix
   * @param string $namespace The namespace to associate with the prefix
   */
  public function registerNamespace($prefix, $namespace)
  {
      $registered = $this->namespaces;
      $registered[$prefix] = $namespace;

      $this->namespaces = $registered;
  }
  /**
   * Converts string for XPath expressions.
   *
   * XPath string literals have no escape sequence, so the quoting style is chosen
   * from the content: single quotes when the string has no apostrophe, double
   * quotes when it has no double quote, and a concat() of pieces when it has both.
   *
   * Examples:
   * <code>
   * echo Crawler::xpathLiteral('foo " bar');
   * //prints 'foo " bar'
   *
   * echo Crawler::xpathLiteral("foo ' bar");
   * //prints "foo ' bar"
   *
   * echo Crawler::xpathLiteral('a\'b"c');
   * //prints concat('a', "'", 'b"c')
   * </code>
   *
   * @param string $s String to be escaped
   *
   * @return string Converted string
   */
  public static function xpathLiteral($s)
  {
      $hasApostrophe = false !== strpos($s, "'");
      if (!$hasApostrophe) {
          return "'".$s."'";
      }

      $hasDoubleQuote = false !== strpos($s, '"');
      if (!$hasDoubleQuote) {
          return '"'.$s.'"';
      }

      // Both quote kinds are present: cut the string at every apostrophe,
      // single-quote each piece and join the pieces with a double-quoted apostrophe.
      $pieces = explode("'", $s);
      $tail = array_pop($pieces);

      $parts = array();
      foreach ($pieces as $piece) {
          $parts[] = "'".$piece."'";
          $parts[] = '"\'"';
      }
      $parts[] = "'".$tail."'";

      return 'concat('.implode(', ', $parts).')';
  }
  /**
   * Filters the list of nodes with an XPath expression.
   *
   * The XPath expression should already be processed to apply it in the context of each node.
   * The expression is queried once per node of the list, and every result
   * is added to one new crawler.
   *
   * @param string $xpath
   *
   * @return self
   */
  private function filterRelativeXPath($xpath)
  {
      $namespacePrefixes = $this->findNamespacePrefixes($xpath);
      $matches = $this->createSubCrawler(null);

      foreach ($this->nodes as $contextNode) {
          $query = $this->createDOMXPath($contextNode->ownerDocument, $namespacePrefixes);
          $found = $query->query($xpath, $contextNode);
          $matches->add($found);
      }

      return $matches;
  }
  /**
   * Make the XPath relative to the current context.
   *
   * The expression is cut at every "|" found outside of string literals and square
   * brackets. Each part is rewritten so that it starts from the context node, and the
   * rewritten parts are joined back with " | ". The returned XPath will match elements
   * matching the XPath inside the current crawler when running in the context of a node
   * of the crawler.
   *
   * An expression with an unterminated string literal, or whose square brackets are
   * still open when its end is reached, is returned unchanged.
   *
   * @param string $xpath
   *
   * @return string
   */
  private function relativize($xpath)
  {
      $whitespace = " \t\n\r\0\x0B";
      $length = strlen($xpath);
      $rewritten = array();
      $bracketDepth = 0;
      $partStart = strspn($xpath, $whitespace);

      for ($cursor = $partStart; $cursor <= $length; ++$cursor) {
          // Jump to the next quote, square bracket or union operator (or to the end).
          $cursor += strcspn($xpath, '"\'[]|', $cursor);

          if ($cursor < $length) {
              $char = $xpath[$cursor];

              if ('"' === $char || "'" === $char) {
                  // Skip the whole string literal by moving onto its closing quote.
                  $cursor = strpos($xpath, $char, $cursor + 1);
                  if (false === $cursor) {
                      return $xpath; // The XPath expression is invalid
                  }

                  continue;
              }

              if ('[' === $char) {
                  ++$bracketDepth;

                  continue;
              }

              if (']' === $char) {
                  --$bracketDepth;

                  continue;
              }
          }

          // Inside square brackets a "|" (or the end of the string) does not end a part.
          if ($bracketDepth) {
              continue;
          }

          $opening = '';
          if ($partStart < $length && '(' === $xpath[$partStart]) {
              // If the union is inside some braces, we need to preserve the opening braces
              // and apply the change only inside it.
              $openingLength = 1 + strspn($xpath, "( \t\n\r\0\x0B", $partStart + 1);
              $opening = substr($xpath, $partStart, $openingLength);
              $partStart += $openingLength;
          }

          $part = rtrim(substr($xpath, $partStart, $cursor - $partStart));
          $rewritten[] = $opening.self::anchorToContextNode($part);

          if ($cursor === $length) {
              return implode(' | ', $rewritten);
          }

          // Skip the whitespace following the "|"; the next part starts right after it.
          $cursor += strspn($xpath, $whitespace, $cursor + 1);
          $partStart = $cursor + 1;
      }

      return $xpath; // The XPath expression is invalid
  }

  /**
   * Rewrites one part of a union so that it is evaluated from the context node.
   *
   * @param string $expression A single, right-trimmed part of an XPath union
   *
   * @return string
   */
  private static function anchorToContextNode($expression)
  {
      // An expression which will never match, used in place of the parts which cannot
      // match in the crawler.
      $neverMatches = 'a[name() = "b"]';

      if (0 === strpos($expression, 'self::*/')) {
          $expression = './'.substr($expression, 8);
      }

      // add prefix before absolute element selector
      if ('' === $expression) {
          return $neverMatches;
      }

      if (0 === strpos($expression, '//')) {
          return 'descendant-or-self::'.substr($expression, 2);
      }

      if (0 === strpos($expression, './/')) {
          return 'descendant-or-self::'.substr($expression, 3);
      }

      if (0 === strpos($expression, './')) {
          return 'self::'.substr($expression, 2);
      }

      if (0 === strpos($expression, 'child::')) {
          return 'self::'.substr($expression, 7);
      }

      if ('/' === $expression[0] || '.' === $expression[0] || 0 === strpos($expression, 'self::')) {
          return $neverMatches;
      }

      if (0 === strpos($expression, 'descendant::')) {
          return 'descendant-or-self::'.substr($expression, 12);
      }

      if (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $expression)) {
          // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes)
          return $neverMatches;
      }

      if (0 !== strpos($expression, 'descendant-or-self::')) {
          return 'self::'.$expression;
      }

      return $expression;
  }
  /**
   * Returns the node stored at the given position of the crawler's node list.
   *
   * @param int $position
   *
   * @return \DOMElement|null The node, or null when no node is set at that position
   */
  public function getNode($position)
  {
      return isset($this->nodes[$position]) ? $this->nodes[$position] : null;
  }
  /**
   * Returns the number of nodes held by the crawler.
   *
   * @return int
   */
  public function count()
  {
      return \count($this->nodes);
  }
  /**
   * Returns an iterator over the nodes held by the crawler.
   *
   * @return \ArrayIterator
   */
  public function getIterator()
  {
      $iterator = new \ArrayIterator($this->nodes);

      return $iterator;
  }
  /**
   * Collects the element nodes met while walking from a node along a sibling direction.
   *
   * The walk starts on $node itself, then repeatedly follows the $siblingDir property
   * until it yields a falsy value. Only element nodes are kept, and the first node of
   * the crawler is always left out of the result.
   *
   * @param \DOMElement $node       Node the walk starts from
   * @param string      $siblingDir Name of the node property to follow
   *
   * @return array
   */
  protected function sibling($node, $siblingDir = 'nextSibling')
  {
      // The node to leave out does not change during the walk: look it up once
      // instead of on every iteration.
      $excluded = $this->getNode(0);
      $nodes = array();

      do {
          if ($node !== $excluded && XML_ELEMENT_NODE === $node->nodeType) {
              $nodes[] = $node;
          }
      } while ($node = $node->$siblingDir);

      return $nodes;
  }
  /**
   * Builds a DOMXPath for a document and registers the namespaces of the given prefixes.
   *
   * A prefix is registered only when a namespace could be discovered for it; the
   * other prefixes are silently left unregistered.
   *
   * @param \DOMDocument $document
   * @param array        $prefixes
   *
   * @return \DOMXPath
   */
  private function createDOMXPath(\DOMDocument $document, array $prefixes = array())
  {
      $domxpath = new \DOMXPath($document);

      foreach ($prefixes as $prefix) {
          $namespace = $this->discoverNamespace($domxpath, $prefix);
          if (null === $namespace) {
              continue;
          }

          $domxpath->registerNamespace($prefix, $namespace);
      }

      return $domxpath;
  }
  /**
   * Finds the namespace to use for a prefix.
   *
   * A namespace stored in this crawler for the prefix wins. Otherwise the document
   * behind the DOMXPath is searched for a namespace node named after the prefix
   * (the empty name is searched when the prefix is the default namespace prefix).
   *
   * @param \DOMXPath $domxpath
   * @param string    $prefix
   *
   * @return string|null The namespace, or null when none was found
   */
  private function discoverNamespace(\DOMXPath $domxpath, $prefix)
  {
      if (isset($this->namespaces[$prefix])) {
          return $this->namespaces[$prefix];
      }

      $lookupName = $this->defaultNamespacePrefix === $prefix ? '' : $prefix;

      // ask for one namespace, otherwise we'd get a collection with an item for each node
      $found = $domxpath->query(sprintf('(//namespace::*[name()="%s"])[last()]', $lookupName))->item(0);

      return $found ? $found->nodeValue : null;
  }
  /**
   * Extracts the distinct namespace prefixes used in an XPath expression.
   *
   * A prefix is a name directly followed by a colon which is itself followed by a
   * character other than a double quote, a slash or another colon.
   *
   * @param string $xpath
   *
   * @return array The prefixes, without duplicates (the keys of the kept matches are preserved)
   */
  private function findNamespacePrefixes($xpath)
  {
      $found = preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $matches);

      return $found ? array_unique($matches['prefix']) : array();
  }
  /**
   * Creates a crawler for some subnodes.
   *
   * The new crawler is built with the URI and base href of this one, and is given
   * this crawler's HTML flag, document and namespaces.
   *
   * @param \DOMElement|\DOMElement[]|\DOMNodeList|null $nodes
   *
   * @return static
   */
  private function createSubCrawler($nodes)
  {
      $sub = new static($nodes, $this->uri, $this->baseHref);

      // Hand the parsing context of this crawler over to the new one.
      $sub->namespaces = $this->namespaces;
      $sub->document = $this->document;
      $sub->isHtml = $this->isHtml;

      return $sub;
  }
  1017. }