eZ Publish  [4.0]
ezxml.php
Go to the documentation of this file.
00001 <?php
00002 //
00003 // $Id$
00004 //
00005 // Definition of eZXML class
00006 //
00007 // Created on: <13-Feb-2002 09:15:42 bf>
00008 //
00009 // ## BEGIN COPYRIGHT, LICENSE AND WARRANTY NOTICE ##
00010 // SOFTWARE NAME: eZ Publish
00011 // SOFTWARE RELEASE: 4.0.x
00012 // COPYRIGHT NOTICE: Copyright (C) 1999-2008 eZ Systems AS
00013 // SOFTWARE LICENSE: GNU General Public License v2.0
00014 // NOTICE: >
00015 //   This program is free software; you can redistribute it and/or
00016 //   modify it under the terms of version 2.0  of the GNU General
00017 //   Public License as published by the Free Software Foundation.
00018 //
00019 //   This program is distributed in the hope that it will be useful,
00020 //   but WITHOUT ANY WARRANTY; without even the implied warranty of
00021 //   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00022 //   GNU General Public License for more details.
00023 //
00024 //   You should have received a copy of version 2.0 of the GNU General
00025 //   Public License along with this program; if not, write to the Free
00026 //   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00027 //   MA 02110-1301, USA.
00028 //
00029 //
00030 // ## END COPYRIGHT, LICENSE AND WARRANTY NOTICE ##
00031 //
00032 
00033 /*! \file ezxml.php
00034   XML DOM parser.
00035 */
00036 
00037 /*! \defgroup eZXML XML parser and DOM library */
00038 
00039 /*!
00040   \class eZXML ezxml.php
00041   \ingroup eZXML
00042   \brief eZXML handles parsing of well formed XML documents.
00043 
00044   eZXML will create a DOM tree from well formed XML documents.
00045 
00046  \sa eZDOMDocument eZDOMNode
00047 */
00048 
00049 require_once( "lib/ezutils/classes/ezdebug.php" );
00050 //include_once( "lib/ezxml/classes/ezdomnode.php" );
00051 //include_once( "lib/ezxml/classes/ezdomdocument.php" );
00052 
class eZXML
{
    /*!
      Constructor.
    */
    function __construct()
    {
    }

    /*!
      \deprecated
      PHP 4 style constructor name. It is kept as a plain method so that code
      calling $xml->eZXML() or parent::eZXML() keeps working; PHP itself uses
      __construct() to initialise new objects.
    */
    function eZXML()
    {
        $this->__construct();
    }

    /*!
      Will return a DOM object tree from the well formed XML.

      $params["SetParentNode"] = false/true : create eZDOMDocument with setParentNode parameter set to true or false, default is false.
      $params["TrimWhiteSpace"] = false/true : should the XML parser ignore whitespaces between tags, default is true.
      $params["CharsetConversion"] = false/true : Whether charset conversion is done or not, default is true.
      $params["ConvertSpecialChars"] = false/true: whether to convert &lt; &gt; &amp; etc into < > &; default is true.

      A "Schema" entry in \a $params is tolerated but not used by the parser.

      \param $xmlDoc the XML text to parse.
      \param $params associative array with the options listed above.
      \param $native if true and the domxml extension is available, the result
                     of domxml_open_mem() is returned instead.

      \return the eZDOMDocument that was built, \c null if \a $xmlDoc is empty
              or no root element was found, or \c false if the document could
              not be parsed (unmatched or unclosed tags).

      \note Namespace declarations are collected in $NamespaceArray and
            $NamespaceStack, which are not reset between calls.
    */
    function domTree( $xmlDoc, $params = array(), $native = false )
    {
        if ( !$xmlDoc )
        {
            return null;
        }

        /* We remove all control chars from the text, although they
         * should not be there in the first place. This is
         * iso-8859-1 and UTF-8 safe. Those characters may never exist
         * in an XML document anyway
         * (http://w3.org/TR/2004/REC-xml-20040204/#NT-Char) so it's safe to
         * remove them */
        $xmlDoc = preg_replace( '/[\x00-\x08\x0b-\x0c\x0e-\x1f]/', '', $xmlDoc );

        if ( $native and function_exists( 'domxml_open_mem' ) )
        {
            return domxml_open_mem( $xmlDoc );
        }

        $trimWhiteSpace = isset( $params["TrimWhiteSpace"] ) ? (bool)$params["TrimWhiteSpace"] : true;
        $setParentNode = isset( $params["SetParentNode"] ) ? $params["SetParentNode"] : false;
        $convertSpecialChars = isset( $params['ConvertSpecialChars'] ) ? (bool)$params['ConvertSpecialChars'] : true;

        $charset = 'UTF-8';
        if ( isset( $params['CharsetConversion'] ) and !$params['CharsetConversion'] )
        {
            $charset = false;
        }

        // Read the charset from the XML header. A charset declared there
        // replaces the value chosen above, also when CharsetConversion is off.
        if ( preg_match( "#<\?xml(.*?)\?>#", $xmlDoc, $matches ) )
        {
            // parseAttributes() returns false when the header has no attributes
            $headerAttributes = $this->parseAttributes( $matches[1] );
            if ( is_array( $headerAttributes ) )
            {
                foreach ( $headerAttributes as $headerAttribute )
                {
                    $headerAttributeName = $headerAttribute->name();
                    // 'charset' is accepted due to a bug in an old xml parser
                    if ( $headerAttributeName == 'encoding' or $headerAttributeName == 'charset' )
                    {
                        $charset = $headerAttribute->content();
                    }
                }
            }
        }

        if ( $charset !== false )
        {
            $codec = eZTextCodec::instance( $charset, false, false );
            if ( $codec )
            {
                $xmlDoc = $codec->convertString( $xmlDoc );
            }
        }

        // strip processing instructions (including the XML header)
        $xmlDoc = preg_replace( "#<\?.*?\?>#", "", $xmlDoc );

        // strip the document type declaration
        $xmlDoc = preg_replace( "%<\!DOCTYPE.*?>%is", "", $xmlDoc );

        // convert all newline types to unix newlines
        $xmlDoc = str_replace( array( "\r\n", "\r" ), "\n", $xmlDoc );

        // strip comments
        $xmlDoc = $this->stripComments( $xmlDoc );

        // libxml compatible object creation
        $domDocument = new eZDOMDocument( '', $setParentNode );
        $this->DOMDocument = $domDocument;

        // Node that receives the children found next; starts at the document
        $currentNode = $domDocument;

        // Open elements: array( "TagName" => local name, "ParentNodeObject" => parent node )
        $tagStack = array();

        $docLength = strlen( $xmlDoc );
        $pos = 0;
        $endTagPos = 0;
        while ( $pos < $docLength )
        {
            if ( $xmlDoc[$pos] == "<" )
            {
                if ( substr( $xmlDoc, $pos, 9 ) === "<![CDATA[" )
                {
                    // CDATA section: the content is taken verbatim, it is
                    // never interpreted as tag name or attributes.
                    $cdataStart = $pos + 9;
                    $cdataEnd = strpos( $xmlDoc, "]]>", $cdataStart );
                    if ( $cdataEnd === false )
                    {
                        eZDebug::writeError( "XML parser error: Closing tag ']]>' for <![CDATA[ not found", "eZ xml" );
                        $cdataEnd = $docLength;
                    }

                    $cdataNode = $domDocument->createElementNode( "#cdata-section" );

                    // as for unprefixed elements, the default namespace is applied
                    if ( isset( $this->NamespaceStack[0] ) )
                    {
                        $cdataNode->setNamespaceURI( $this->NamespaceStack[0] );
                    }

                    $cdataNode->Name = $cdataNode->LocalName = "#cdata-section";
                    $cdataNode->Content = (string)substr( $xmlDoc, $cdataStart, $cdataEnd - $cdataStart );
                    $cdataNode->Type = eZDOMNode::TYPE_CDATASECTION;

                    $currentNode->appendChild( $cdataNode );

                    // continue after the "]]>" marker
                    $pos = $cdataEnd;
                    $endTagPos = $cdataEnd + 2;
                }
                else
                {
                    $endTagPos = strpos( $xmlDoc, ">", $pos );
                    if ( $endTagPos === false )
                    {
                        eZDebug::writeError( "Error parsing XML, tag starting at position $pos is not closed", "eZ xml" );
                        return false;
                    }

                    // tag name with attributes
                    $tagName = (string)substr( $xmlDoc, $pos + 1, $endTagPos - ( $pos + 1 ) );
                    if ( $tagName === "" )
                    {
                        eZDebug::writeError( "Error parsing XML, empty tag at position $pos", "eZ xml" );
                        return false;
                    }

                    if ( $tagName[0] == "/" )
                    {
                        // end tag </tagname>; whitespace before '>' is allowed
                        $closingName = rtrim( (string)substr( $tagName, 1 ) );

                        // strip out namespace; nameSpace:Name
                        $colonPos = strpos( $closingName, ":" );
                        if ( $colonPos > 0 )
                        {
                            $closingName = (string)substr( $closingName, $colonPos + 1 );
                        }

                        $openTag = array_pop( $tagStack );
                        if ( !is_array( $openTag ) or $openTag["TagName"] !== $closingName )
                        {
                            eZDebug::writeError( "Error parsing XML, unmatched tags $closingName" );
                            return false;
                        }

                        // go back up to the parent of the element just closed
                        $currentNode = $openTag["ParentNodeObject"];
                    }
                    else
                    {
                        // The element name ends at the first whitespace character
                        $nameLength = strcspn( $tagName, " \n\t" );
                        $hasAttributePart = ( $nameLength > 0 && $nameLength < strlen( $tagName ) );
                        $justName = $hasAttributePart ? substr( $tagName, 0, $nameLength ) : $tagName;

                        // strip out the namespace prefix
                        $prefix = "";
                        $colonPos = strpos( $justName, ":" );
                        if ( $colonPos > 0 )
                        {
                            $prefix = substr( $justName, 0, $colonPos );
                            $justName = (string)substr( $justName, $colonPos + 1 );
                        }

                        // remove trailing / from the name if exists (<tagname/>)
                        if ( substr( $justName, -1 ) === "/" )
                        {
                            $justName = (string)substr( $justName, 0, -1 );
                        }

                        // create the new XML element node
                        $subNode = $domDocument->createElementNode( $justName );

                        // Attributes are parsed before the prefix lookup so that
                        // namespaces declared on this very tag are already known.
                        if ( $hasAttributePart )
                        {
                            $attributes = $this->parseAttributes( substr( $tagName, $nameLength ) );
                            if ( is_array( $attributes ) )
                            {
                                $subNode->Attributes = $attributes;
                            }
                        }

                        if ( $prefix !== "" )
                        {
                            $subNode->Prefix = $prefix;

                            // find prefix
                            if ( isset( $this->NamespaceArray[$prefix] ) )
                            {
                                $subNode->setNamespaceURI( $this->NamespaceArray[$prefix] );
                            }
                            else
                            {
                                eZDebug::writeError( "Namespace: $prefix not defined", "eZ xml" );
                            }
                        }
                        else if ( isset( $this->NamespaceStack[0] ) )
                        {
                            // set the default namespace
                            $subNode->setNamespaceURI( $this->NamespaceStack[0] );
                        }

                        $domDocument->registerElement( $subNode );
                        $currentNode->appendChild( $subNode );

                        // Unless it is a oneliner (<tagname />), descend into the element
                        if ( substr( $tagName, -1 ) != "/" )
                        {
                            $tagStack[] = array( "TagName" => $justName, "ParentNodeObject" => $currentNode );
                            $currentNode = $subNode;
                        }
                    }
                }
            }

            // $pos equals the document length after an unterminated CDATA
            // section; strpos() must not be given an offset beyond the string.
            $nextTagPos = ( $pos < $docLength ) ? strpos( $xmlDoc, "<", $pos + 1 ) : false;

            if ( $nextTagPos === false )
            {
                // end of document
                $pos = $docLength;
            }
            else
            {
                $pos = $nextTagPos;

                // text between the end of the last tag and the next tag
                $tagContent = (string)substr( $xmlDoc, $endTagPos + 1, $pos - ( $endTagPos + 1 ) );

                // Keep the whitespace consistent, parsing back and forward shouldn't change data
                $tagContent = preg_replace( "#[\n]+[\s]*$#", "", $tagContent, 1 );

                $hasContent = $trimWhiteSpace ? ( trim( $tagContent ) !== "" ) : ( $tagContent !== "" );
                if ( $hasContent )
                {
                    if ( $convertSpecialChars )
                    {
                        $tagContent = $this->decodeEntities( $tagContent );
                    }

                    $textNode = $domDocument->createTextNode( $tagContent );

                    $domDocument->registerElement( $textNode );
                    $currentNode->appendChild( $textNode );
                }
            }
        }

        if ( !$domDocument->Root )
        {
            return null;
        }

        return $domDocument;
    }

    /*!
      \static
      \private
      Returns \a $str with all XML comments (<!-- ... -->) removed, including
      comments that span several lines. \a $str itself is not modified.
    */
    function stripComments( &$str )
    {
        return preg_replace( "#<\!--.*?-->#s", "", $str );
    }

    /*!
      \private
      Replaces the five predefined XML entities (&gt; &lt; &apos; &quot; &amp;)
      in \a $text with the characters they stand for. &amp; is handled last so
      that text such as &amp;lt; ends up as &lt; and is not decoded twice.
    */
    protected function decodeEntities( $text )
    {
        return str_replace( array( "&gt;", "&lt;", "&apos;", "&quot;", "&amp;" ),
                            array( ">", "<", "'", '"', "&" ),
                            $text );
    }

    /*!
      \private
      Parses the attributes. Returns false if no attributes in the supplied string is found.

      Every name="value" or name='value' pair in \a $attributeString becomes an
      eZDOMNode of type eZDOMNode::TYPE_ATTRIBUTE. The value may contain the
      other quote character and newlines. Namespace declarations
      (xmlns:prefix="uri" and xmlns="uri") are recorded in $NamespaceArray
      and $NamespaceStack as a side effect.

      \return an array of eZDOMNode objects, or \c false if there are none.
    */
    function parseAttributes( $attributeString )
    {
        $attributeNodes = array();

        // 1 = attribute name, 2 = quote character, 3 = raw value
        preg_match_all( '/([a-zA-Z0-9:_-]+)\s*=\s*("|\')(.*?)\2/s', $attributeString, $attributeMatches, PREG_SET_ORDER );

        foreach ( $attributeMatches as $attributeMatch )
        {
            $attributeName = $attributeMatch[1];
            $attributeValue = $this->decodeEntities( $attributeMatch[3] );
            $attributeNamespaceURI = false;
            $attributePrefix = false;

            // strip out namespace; nameSpace:Name
            $colonPos = strpos( $attributeName, ":" );
            if ( $colonPos > 0 )
            {
                $attributePrefix = substr( $attributeName, 0, $colonPos );
                $attributeName = (string)substr( $attributeName, $colonPos + 1 );
            }

            $isNamespaceDefinition = ( $attributePrefix === "xmlns" );
            $isDefaultNamespaceDefinition = ( $attributePrefix === false && $attributeName === "xmlns" );

            // check for namespace definition
            if ( $isNamespaceDefinition )
            {
                $attributeNamespaceURI = $attributeValue;
                $this->NamespaceArray[$attributeName] = $attributeValue;

                // No document exists yet while the XML header is being parsed
                if ( is_object( $this->DOMDocument ) )
                {
                    $this->DOMDocument->registerNamespaceAlias( $attributeName, $attributeValue );
                }
            }

            // check for default namespace definition
            if ( $isDefaultNamespaceDefinition )
            {
                $attributeNamespaceURI = $attributeValue;

                // change the default namespace
                $this->NamespaceStack[] = $attributeNamespaceURI;
            }

            $attrNode = new eZDOMNode();
            $attrNode->Name = $attributeName;

            if ( $isNamespaceDefinition )
            {
                $attrNode->LocalName = $attributeName;
                $attrNode->NamespaceURI = $attributeNamespaceURI;
                $attrNode->Prefix = $attributePrefix;
            }
            else if ( $attributePrefix != false )
            {
                $attrNode->Prefix = $attributePrefix;
                $attrNode->LocalName = $attributeName;

                // find prefix
                if ( isset( $this->NamespaceArray[$attributePrefix] ) )
                {
                    $attrNode->NamespaceURI = $this->NamespaceArray[$attributePrefix];
                }
                else
                {
                    eZDebug::writeError( "Namespace: $attributePrefix not found", "eZ xml" );
                }
            }
            else
            {
                if ( $isDefaultNamespaceDefinition )
                {
                    $attrNode->LocalName = $attributeName;
                    $attrNode->NamespaceURI = $attributeNamespaceURI;
                }
                else
                {
                    $attrNode->NamespaceURI = false;
                    $attrNode->LocalName = false;
                }
                $attrNode->Prefix = false;
            }

            $attrNode->Type = eZDOMNode::TYPE_ATTRIBUTE;
            $attrNode->Content = $attributeValue;

            $attributeNodes[] = $attrNode;
        }

        return $attributeNodes ? $attributeNodes : false;
    }

    /// Default namespace URIs in the order their xmlns="..." declarations were parsed
    public $NamespaceStack = array();

    /// Maps the namespace prefixes declared with xmlns:prefix="..." to their URIs
    public $NamespaceArray = array();

    /// Contains the current namespace
    public $CurrentNameSpace;

    /// Contains a reference to the DOM document object
    public $DOMDocument;
}
00529 
00530 ?>