|
eZ Publish
[4.0]
|
<?php
//
// $Id$
//
// Definition of eZXML class
//
// Created on: <13-Feb-2002 09:15:42 bf>
//
// ## BEGIN COPYRIGHT, LICENSE AND WARRANTY NOTICE ##
// SOFTWARE NAME: eZ Publish
// SOFTWARE RELEASE: 4.0.x
// COPYRIGHT NOTICE: Copyright (C) 1999-2008 eZ Systems AS
// SOFTWARE LICENSE: GNU General Public License v2.0
// NOTICE: >
//   This program is free software; you can redistribute it and/or
//   modify it under the terms of version 2.0  of the GNU General
//   Public License as published by the Free Software Foundation.
//
//   This program is distributed in the hope that it will be useful,
//   but WITHOUT ANY WARRANTY; without even the implied warranty of
//   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//   GNU General Public License for more details.
//
//   You should have received a copy of version 2.0 of the GNU General
//   Public License along with this program; if not, write to the Free
//   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
//   MA 02110-1301, USA.
//
//
// ## END COPYRIGHT, LICENSE AND WARRANTY NOTICE ##
//

/*! \file ezxml.php
  XML DOM parser.
*/

/*! \defgroup eZXML XML parser and DOM library */

/*!
  \class eZXML ezxml.php
  \ingroup eZXML
  \brief eZXML handles parsing of well formed XML documents.

  eZXML will create a DOM tree from well formed XML documents.

  \sa eZDOMDocument eZDOMNode
*/

require_once( "lib/ezutils/classes/ezdebug.php" );
//include_once( "lib/ezxml/classes/ezdomnode.php" );
//include_once( "lib/ezxml/classes/ezdomdocument.php" );

class eZXML
{
    /*!
      Constructor
    */
    function eZXML( )
    {

    }

    /*!
      Will return a DOM object tree from the well formed XML.

      \param $xmlDoc The XML text to parse.
      \param $params Optional parser settings:
      $params["SetParentNode"] = false/true : create eZDOMDocument with setParentNode parameter set to true or false, default is false.
      $params["TrimWhiteSpace"] = false/true : should the XML parser ignore whitespaces between tags, default is true.
      $params["CharsetConversion"] = false/true : Whether charset conversion is done or not, default is true.
      $params["ConvertSpecialChars"] = false/true: whether to convert &lt; &gt; &amp; etc into < > &; default is true.
      \param $native If true and the function domxml_open_mem() exists, the text is handed to it
                     and its result is returned as is.

      \return The eZDOMDocument that was built, \c null if \a $xmlDoc is empty or no root node
              was found, or \c false if the markup could not be parsed (unmatched or
              unterminated tags).

      \note Namespace declarations found while parsing are stored in the NamespaceArray and
            NamespaceStack members, which are not reset here.
            NOTE(review): this looks like it lets declarations leak between documents parsed
            with the same instance - confirm before relying on instance reuse.
    */
    function domTree( $xmlDoc, $params = array(), $native = false )
    {
        if ( !$xmlDoc )
        {
            return null;
        }

        /* We remove all control chars from the text, although they
         * should have not be there in the first place. This is
         * iso-8859-1 and UTF-8 safe. Those characters might also never exist
         * in an XML document in the first place
         * (http://w3.org/TR/2004/REC-xml-20040204/#NT-Char) so it's safe to
         * remove them */
        $xmlDoc = preg_replace('/[\x00-\x08\x0b-\x0c\x0e-\x1f]/', '', $xmlDoc);

        if ( $native and function_exists( 'domxml_open_mem' ) )
        {
            $domDocument = domxml_open_mem( $xmlDoc );
            return $domDocument;
        }

        if ( !isset( $params["TrimWhiteSpace"] ) )
            $params["TrimWhiteSpace"] = true;

        if ( !isset( $params["SetParentNode"] ) )
            $params["SetParentNode"] = false;

        $charset = 'UTF-8';
        if ( isset( $params['CharsetConversion'] ) and
             !$params['CharsetConversion'] )
            $charset = false;
        if ( !isset( $params['ConvertSpecialChars'] ) )
        {
            $params['ConvertSpecialChars'] = true;
        }

        $TagStack = array();

        // read the charset from the XML declaration, if there is one
        if ( preg_match( "#<\?xml(.*?)\?>#", $xmlDoc, $matches ) )
        {
            $xmlAttributes = $this->parseAttributes( $matches[1] );
            // parseAttributes() returns false when the declaration has no attributes
            if ( is_array( $xmlAttributes ) )
            {
                foreach ( $xmlAttributes as $xmlAttribute )
                {
                    if ( $xmlAttribute->name() == 'encoding' )
                        $charset = $xmlAttribute->content();
                    // This is required due to a bug in an old xml parser
                    else if ( $xmlAttribute->name() == 'charset' )
                        $charset = $xmlAttribute->content();
                }
            }
        }

        if ( $charset !== false )
        {
            //include_once( 'lib/ezi18n/classes/eztextcodec.php' );
            $codec = eZTextCodec::instance( $charset, false, false );
            if ( $codec )
            {
                $xmlDoc = $codec->convertString( $xmlDoc );
            }
        }

        // strip the XML declaration and any other processing instruction
        $xmlDoc = preg_replace( "#<\?.*?\?>#", "", $xmlDoc );

        // strip the document type declaration
        $xmlDoc = preg_replace( "%<\!DOCTYPE.*?>%is", "", $xmlDoc );

        // convert all newline types to unix newlines
        $xmlDoc = preg_replace( "#\n|\r\n|\r#", "\n", $xmlDoc );

        // strip comments
        $xmlDoc = $this->stripComments( $xmlDoc );

        // libxml compatible object creation
        $domDocument = new eZDOMDocument( '', $params["SetParentNode"] );

        $this->DOMDocument =& $domDocument;
        $currentNode =& $domDocument;

        $docLength = strlen( $xmlDoc );
        $pos = 0;
        $endTagPos = 0;
        while ( $pos < $docLength )
        {
            if ( $xmlDoc[$pos] == "<" )
            {
                if ( $pos > 0 and substr( $xmlDoc, $pos, 9 ) == "<![CDATA[" )
                {
                    // CDATA section: the text is taken verbatim, it must never be run
                    // through the tag name / attribute parsing below
                    $cdataStart = $pos + 9;
                    $endTagPos = strpos( $xmlDoc, "]]>", $pos );
                    if ( $endTagPos === false )
                    {
                        eZDebug::writeError( "XML parser error: Closing tag ']]>' for <![CDATA[ not found" , "eZ xml" );
                        $endTagPos = $docLength;
                    }

                    // $subNode may still be bound by reference to $currentNode
                    unset( $subNode );
                    $subNode = $domDocument->createElementNode( "#cdata-section" );

                    // CDATA nodes get the default namespace, like unprefixed elements
                    if ( isset( $this->NamespaceStack[0] ) )
                    {
                        $subNode->setNamespaceURI( $this->NamespaceStack[0] );
                    }

                    $subNode->Name = $subNode->LocalName = "#cdata-section";
                    $subNode->Content = substr( $xmlDoc, $cdataStart, $endTagPos - $cdataStart );
                    $subNode->Type = eZDOMNode::TYPE_CDATASECTION;

                    $currentNode->appendChild( $subNode );

                    // continue after the closing ]]>
                    $pos = $endTagPos;
                    $endTagPos += 2;
                }
                else
                {
                    // find the end of the tag
                    $endTagPos = strpos( $xmlDoc, ">", $pos );
                    if ( $endTagPos === false )
                    {
                        eZDebug::writeError( "Error parsing XML, tag starting at position $pos is not terminated", "eZ xml" );
                        return false;
                    }

                    // tag name with attributes
                    $tagName = substr( $xmlDoc, $pos + 1, $endTagPos - ( $pos + 1 ) );

                    // check if it's an endtag </tagname>
                    if ( substr( $tagName, 0, 1 ) == "/" )
                    {
                        $tagName = substr( $tagName, 1 );

                        // strip out namespace; nameSpace:Name
                        $colonPos = strpos( $tagName, ":" );
                        if ( $colonPos > 0 )
                            $tagName = substr( $tagName, $colonPos + 1 );

                        // array_pop() gives null when there is no open tag left
                        $lastNodeArray = array_pop( $TagStack );
                        if ( $lastNodeArray === null or
                             (string)$lastNodeArray["TagName"] !== (string)$tagName )
                        {
                            eZDebug::writeError( "Error parsing XML, unmatched tags $tagName" );
                            return false;
                        }

                        // step back up to the parent of the element that was closed
                        unset( $currentNode );
                        $currentNode =& $lastNodeArray["ParentNodeObject"];
                    }
                    else
                    {
                        // the name ends at the first space or newline, a separator at
                        // offset 0 is treated as not found
                        $firstSpaceEnd = strpos( $tagName, " " );
                        $firstNewlineEnd = strpos( $tagName, "\n" );

                        $tagNameEnd = 0;
                        if ( $firstNewlineEnd != false and $firstSpaceEnd != false )
                        {
                            $tagNameEnd = min( $firstSpaceEnd, $firstNewlineEnd );
                        }
                        else if ( $firstNewlineEnd != false )
                        {
                            $tagNameEnd = $firstNewlineEnd;
                        }
                        else if ( $firstSpaceEnd != false )
                        {
                            $tagNameEnd = $firstSpaceEnd;
                        }

                        if ( $tagNameEnd > 0 )
                            $justName = substr( $tagName, 0, $tagNameEnd );
                        else
                            $justName = $tagName;

                        // strip out the namespace prefix
                        // If $justname contains ![CDATA[ we should not set namespace prefix
                        $colonPos = strpos( $justName, "![CDATA[" ) === false ? strpos( $justName, ":" ) : false;

                        $prefix = "";
                        if ( $colonPos > 0 )
                        {
                            $prefix = substr( $justName, 0, $colonPos );
                            $justName = substr( $justName, $colonPos + 1 );
                        }

                        // remove trailing / from the name if exists
                        if ( substr( $justName, -1 ) == "/" )
                        {
                            $justName = substr( $justName, 0, strlen( $justName ) - 1 );
                        }

                        // create the new XML element node,
                        // $subNode may still be bound by reference to $currentNode
                        unset( $subNode );
                        $subNode = $domDocument->createElementNode( $justName );

                        // find attributes, this must happen before the namespace lookup
                        // since the element may declare the namespace it uses itself
                        if ( $tagNameEnd > 0 )
                        {
                            $attributePart = substr( $tagName, $tagNameEnd );

                            // $attr was bound by reference to the previous element
                            unset( $attr );
                            $attr = $this->parseAttributes( $attributePart );

                            if ( $attr != false )
                                $subNode->Attributes =& $attr;
                        }

                        if ( $prefix !== "" )
                        {
                            $subNode->Prefix = $prefix;

                            // find prefix
                            if ( isset( $this->NamespaceArray[$prefix] ) )
                            {
                                $subNode->setNamespaceURI( $this->NamespaceArray[$prefix] );
                            }
                            else
                            {
                                eZDebug::writeError( "Namespace: $prefix not defined", "eZ xml" );
                            }
                        }
                        else
                        {
                            // set the default namespace
                            if ( isset( $this->NamespaceStack[0] ) )
                            {
                                $subNode->setNamespaceURI( $this->NamespaceStack[0] );
                            }
                        }

                        $domDocument->registerElement( $subNode );

                        $currentNode->appendChild( $subNode );

                        // unless it is a oneliner, <tagname />, the element becomes the
                        // current node until its end tag is found
                        if ( substr( $tagName, -1 ) != "/" )
                        {
                            $TagStack[] = array( "TagName" => $justName, "ParentNodeObject" => &$currentNode );

                            unset( $currentNode );
                            $currentNode =& $subNode;
                        }
                    }
                }
            }

            // find the next tag, an offset beyond the end of the text is not allowed
            if ( $pos + 1 < $docLength )
                $pos = strpos( $xmlDoc, "<", $pos + 1 );
            else
                $pos = false;

            if ( $pos === false )
            {
                // end of document
                $pos = $docLength;
            }
            else
            {
                // content tag
                $tagContent = substr( $xmlDoc, $endTagPos + 1, $pos - ( $endTagPos + 1 ) );

                // Keep the whitespace consistent, parsing back and forward shouldn't change data
                $tagContent = preg_replace( "#[\n]+[\s]*$#", "", $tagContent, 1 );

                if ( ( $params["TrimWhiteSpace"] == true and trim( $tagContent ) != "" ) or
                     ( $params["TrimWhiteSpace"] == false and $tagContent != "" ) )
                {
                    // convert special chars, &amp; must be last to avoid decoding twice
                    if ( $params["ConvertSpecialChars"] == true )
                    {
                        $tagContent = str_replace( "&gt;", ">", $tagContent );
                        $tagContent = str_replace( "&lt;", "<", $tagContent );
                        $tagContent = str_replace( "&apos;", "'", $tagContent );
                        $tagContent = str_replace( "&quot;", '"', $tagContent );
                        $tagContent = str_replace( "&amp;", "&", $tagContent );
                    }

                    unset( $subNode );
                    $subNode = $domDocument->createTextNode( $tagContent );

                    $domDocument->registerElement( $subNode );
                    $currentNode->appendChild( $subNode );
                }
            }
        }

        if ( !$domDocument->Root )
        {
            return null;
        }

        return $domDocument;
    }

    /*!
      \static
      \private
      \return \a $str with all XML comments, <!-- ... -->, removed. The comments may span
              several lines. \a $str itself is not modified.
    */
    function stripComments( &$str )
    {
        return preg_replace( "#<\!--.*?-->#s", "", $str );
    }

    /*!
      \private
      Parses the attributes. Returns false if no attributes in the supplied string is found.

      \param $attributeString The text following the tag name, e.g. ' id="1" xml:lang="en"'.
      \return An array with one eZDOMNode of type eZDOMNode::TYPE_ATTRIBUTE for each attribute,
              or \c false if there are none.

      Namespace declarations are recorded as a side effect: xmlns:alias="uri" is stored in
      NamespaceArray and registered with the current DOM document (if there is one), and
      xmlns="uri" is appended to NamespaceStack.
    */
    function parseAttributes( $attributeString )
    {
        $attributeNodes = array();

        preg_match_all( "/([a-zA-Z0-9:_-]+\s*=\s*(\"|').*?(\\2))/i", $attributeString, $attributeArray );

        foreach ( $attributeArray[0] as $attributePart )
        {
            if ( trim( $attributePart ) != "" && trim( $attributePart ) != "/" )
            {
                $attributeNamespaceURI = false;
                $attributePrefix = false;

                // split into name and value at the first =" or =' only, the value
                // may itself contain such a sequence
                $attributeTmpArray = preg_split( "#\s*(=\s*(\"|'))#", $attributePart, 2 );

                $attributeName = $attributeTmpArray[0];

                // strip out namespace; nameSpace:Name
                $colonPos = strpos( $attributeName, ":" );

                if ( $colonPos > 0 )
                {
                    $attributePrefix = substr( $attributeName, 0, $colonPos );
                    $attributeName = substr( $attributeName, $colonPos + 1 );
                }

                $attributeValue = $attributeTmpArray[1];

                // remove the closing quote from the value part
                $attributeValue = substr( $attributeValue, 0, strlen( $attributeValue ) - 1 );

                // convert special chars, &amp; must be last to avoid decoding twice
                $attributeValue = str_replace( "&gt;", ">", $attributeValue );
                $attributeValue = str_replace( "&lt;", "<", $attributeValue );
                $attributeValue = str_replace( "&apos;", "'", $attributeValue );
                $attributeValue = str_replace( "&quot;", '"', $attributeValue );
                $attributeValue = str_replace( "&amp;", "&", $attributeValue );

                // check for namespace definition
                if ( $attributePrefix == "xmlns" )
                {
                    $attributeNamespaceURI = $attributeValue;
                    $this->NamespaceArray[$attributeName] = $attributeValue;

                    // there is no document yet when the XML declaration is parsed
                    if ( is_object( $this->DOMDocument ) )
                    {
                        $this->DOMDocument->registerNamespaceAlias( $attributeName, $attributeValue );
                    }
                }

                // check for default namespace definition
                if ( $attributeName == "xmlns" )
                {
                    $attributeNamespaceURI = $attributeValue;

                    // change the default namespace
                    $this->NamespaceStack[] = $attributeNamespaceURI;
                }

                unset( $attrNode );
                $attrNode = new eZDOMNode();
                $attrNode->Name = $attributeName;

                if ( $attributePrefix != false && $attributePrefix != "xmlns" )
                {
                    $attrNode->Prefix = $attributePrefix;
                    $attrNode->LocalName = $attributeName;

                    // find prefix
                    if ( isset( $this->NamespaceArray[$attributePrefix] ) )
                    {
                        $attrNode->NamespaceURI = $this->NamespaceArray[$attributePrefix];
                    }
                    else
                    {
                        eZDebug::writeError( "Namespace: $attributePrefix not found", "eZ xml" );
                    }
                }
                else if ( $attributePrefix == "xmlns" )
                {
                    $attrNode->LocalName = $attributeName;
                    $attrNode->NamespaceURI = $attributeNamespaceURI;
                    $attrNode->Prefix = $attributePrefix;
                }
                else
                {
                    // check for default namespace definition
                    if ( $attributeName == "xmlns" )
                    {
                        $attrNode->LocalName = $attributeName;
                        $attrNode->NamespaceURI = $attributeNamespaceURI;
                    }
                    else
                    {
                        $attrNode->NamespaceURI = false;
                        $attrNode->LocalName = false;
                    }
                    $attrNode->Prefix = false;
                }

                $attrNode->Type = eZDOMNode::TYPE_ATTRIBUTE;
                $attrNode->Content = $attributeValue;

                $attributeNodes[] = $attrNode;
            }
        }

        // callers expect false, not an empty array, when nothing was found
        if ( count( $attributeNodes ) == 0 )
        {
            return false;
        }
        return $attributeNodes;
    }

    /// Contains the default namespaces (xmlns="...") in the order they were declared
    public $NamespaceStack = array();

    /// Contains the available namespaces, namespace alias => namespace URI
    public $NamespaceArray = array();

    /// Contains the current namespace
    public $CurrentNameSpace;

    /// Contains a reference to the DOM document object
    public $DOMDocument;
}

?>