Source code for file /openid/Services/Yadis/ParseHTML.php
Documentation is available at ParseHTML.php
* This is the HTML pseudo-parser for the Yadis library.
* LICENSE: See the COPYING file included in this distribution.
* @copyright 2005 Janrain, Inc.
* @license http://www.gnu.org/copyleft/lesser.html LGPL
* This class is responsible for scanning an HTML string to find META
* tags and their attributes. This is used by the Yadis discovery
* process. This class must be instantiated to be used.
var $_tag_expr =
"<%s\b(?!:)([^>]*?)(?:\/>|>(.*?)(?:<\/?%s\s*>|\Z))";
var $_close_tag_expr =
"<\/?%s\s*>";
"<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b(?!:)[^>]*>.*?<\/script>";
var $_attr_find =
'\b([-\w]+)=("[^"]*"|\'[^\']*\'|[^\'"\s\/<>]+)';
$this->_meta_find =
sprintf("/<meta\b(?!:)([^>]*)(?!<)>/%s",
$this->_removed_re =
sprintf("/%s/%s",
$this->_attr_find =
sprintf("/%s/%s",
$this->_entity_replacements =
array(
$this->_entity_replacements));
* Replace HTML entities (amp, lt, gt, and quot) as well as
* numeric entities (e.g. &#x9f;) with their actual values and
* return the decoded string.
* @param string $str The string in which to look for entities
* @return string $new_str The new string with entities decoded
function replaceEntities($str)
foreach ($this->_entity_replacements as $old =>
$new) {
// Decode hexadecimal numeric entities (e.g. &#x9f;) by hand; per the
// original note, html_entity_decode does not cover these. A callback
// is used instead of the "e" (eval) pattern modifier, which was
// deprecated in PHP 5.5 and removed in PHP 7.0 (where it makes
// preg_replace fail and return NULL). Matching stays case-insensitive.
$str =
preg_replace_callback(
    '~&#x([0-9a-f]+);~i',
    function ($match) {
        // $match[1] holds the hex digits captured between "&#x" and ";".
        return chr(hexdec($match[1]));
    },
    $str
);
* Strip single and double quotes off of a string, if they are
* present.
* @param string $str The original string
* @return string $new_str The new string with leading and
* trailing quotes removed
function removeQuotes($str)
$single =
"/^\'(.*)\'$/";
* Create a regular expression that will match an opening tag (and
* an optional closing tag) of a given name.
* @param string $tag_name The tag name to match
* @param array $close_tags An array of tag names which also
* constitute closing of the original tag
* @return string $regex A delimited regular expression string, ready
* to be used with the preg_* functions (e.g. preg_match)
function tagMatcher($tag_name, $close_tags =
null)
$closer =
sprintf("(?:%s)", $options);
$expr =
sprintf($this->_tag_expr, $tag_name, $closer);
return sprintf("/%s/%s", $expr, $this->_re_flags);
return $this->tagMatcher('html', array('body'));
return $this->tagMatcher('head', array('body'));
* Given an HTML document string, this finds all the META tags in
* the document, provided they are found in the
* <HTML><HEAD>...</HEAD> section of the document. The <HTML> tag
* itself may be missing.
* @param string $html_string An HTML document string
* @return array $tag_list Array of tags; each tag is an array
* mapping attribute names to their values
function getMetaTags($html_string)
// Look for the closing body tag.
$body_closer =
sprintf($this->_close_tag_expr, 'body');
preg_match($body_closer, $html_string, $body_matches,
$html_string =
substr($html_string, 0, $body_matches[0][1]);
// Look for the opening body tag, and discard everything from
// that tag onward.
$body_re =
$this->tagMatcher('body');
preg_match($body_re, $html_string, $body_matches, PREG_OFFSET_CAPTURE);
$html_string =
substr($html_string, 0, $body_matches[0][1]);
// If an HTML tag is found at all, it must be in the right
// order; else, it may be missing (which is a case we allow
// for).
$html_re =
$this->tagMatcher('html', array('body'));
preg_match($html_re, $html_string, $html_matches);
$html =
$html_matches[0];
// Try to find the <HEAD> tag.
$head_re =
$this->headFind();
if (!preg_match($head_re, $html, $head_matches)) {
foreach ($link_matches[0] as $link) {
foreach ($attr_matches[0] as $index =>
$full_match) {
$name =
$attr_matches[1][$index];
$value =
$this->replaceEntities(
$this->removeQuotes($attr_matches[2][$index]));
$link_data[] =
$link_attrs;
* Looks for a META tag with an "http-equiv" attribute whose value
* is one of ("x-xrds-location", "x-yadis-location"), ignoring
* case. If such a META tag is found, its "content" attribute
* value is returned.
* @param string $html_string An HTML document in string format
* @return mixed $content The "content" attribute value of the
* META tag, if found, or null if no such tag was found.
$meta_tags =
$this->getMetaTags($html_string);
foreach ($meta_tags as $tag) {
array('x-xrds-location', 'x-yadis-location'))) &&