eZ Publish  [4.0]
ezxmlinputparser.php
Go to the documentation of this file.
00001 <?php
00002 //
00003 // Definition of eZXMLInputParser class
00004 //
00005 // Created on: <27-Mar-2006 15:28:39 ks>
00006 //
00007 // ## BEGIN COPYRIGHT, LICENSE AND WARRANTY NOTICE ##
00008 // SOFTWARE NAME: eZ Publish
00009 // SOFTWARE RELEASE: 4.0.x
00010 // COPYRIGHT NOTICE: Copyright (C) 1999-2008 eZ Systems AS
00011 // SOFTWARE LICENSE: GNU General Public License v2.0
00012 // NOTICE: >
00013 //   This program is free software; you can redistribute it and/or
00014 //   modify it under the terms of version 2.0  of the GNU General
00015 //   Public License as published by the Free Software Foundation.
00016 //
00017 //   This program is distributed in the hope that it will be useful,
00018 //   but WITHOUT ANY WARRANTY; without even the implied warranty of
00019 //   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00020 //   GNU General Public License for more details.
00021 //
00022 //   You should have received a copy of version 2.0 of the GNU General
00023 //   Public License along with this program; if not, write to the Free
00024 //   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00025 //   MA 02110-1301, USA.
00026 //
00027 //
00028 // ## END COPYRIGHT, LICENSE AND WARRANTY NOTICE ##
00029 //
00030 
00031 /*
00032     Base class for the input parser.
00033     The goal of the parser is XML/HTML analyzing, fixing and transforming.
00034     The input is processed in 2 passes:
00035     - 1st pass: Parsing input, check for syntax errors, build DOM tree.
00036     - 2nd pass: Walking through DOM tree, checking validity by XML schema,
00037                 calling tag handlers to transform the tree.
00038 
00039     Both passes are controlled by the arrays described below and by user handler functions.
00040 
00041 */
00042 
00043 // if ( !class_exists( 'eZXMLSchema' ) ) // AS 21-09-2007: commented out because of include_once being commented out
00044     //include_once( 'kernel/classes/datatypes/ezxmltext/ezxmlschema.php' );
00045 
00046 class eZXMLInputParser
00047 {
00048     /// \deprecated (back-compatibility)
00049     const SHOW_NO_ERRORS = 0;
00050     const SHOW_SCHEMA_ERRORS = 1;
00051     const SHOW_ALL_ERRORS = 2;
00052 
00053     /// Use these constants for error types
00054     const ERROR_NONE = 0;
00055     const ERROR_SYNTAX = 4;
00056     const ERROR_SCHEMA = 8;
00057     const ERROR_DATA = 16;
00058     const ERROR_ALL = 28; // 4+8+16
00059 
00060     /* $InputTags array contains properties of elements that come from the input.
00061 
00062     Each array element describes a tag that comes from the input. Arrays index is
00063     a tag's name. Each element is an array that may contain the following members:
00064 
00065     'name'        - a string representing a new name of the tag,
00066     'nameHandler' - a name of the function that returns new tag name. Function format:
00067                     function tagNameHandler( $tagName, &$attributes )
00068 
00069     If none of those elements is defined, the original tag's name is used.
00070 
00071     'noChildren'  - boolean value that determines if this tag could have child tags,
00072                     default value is false.
00073 
00074     Example:
00075 
00076     public $InputTags = array(
00077 
00078         'original-name' => array( 'name' => 'new-name' ),
00079 
00080         'original-name2' => array( 'nameHandler' => 'tagNameHandler',
00081                                    'noChildren' => true ),
00082 
00083          ...
00084 
00085          );
00086     */
00087 
00088     public $InputTags = array();
00089 
00090     /*
00091     $OutputTags array contains properties of elements that are produced in the output.
00092     Each array element describes a tag presented in the output. Arrays index is
00093     a tag's name. Each element is an array that may contain the following members:
00094 
00095     'parsingHandler' - "Parsing handler" called at parse pass 1 before processing tag's children.
00096     'initHandler'    - "Init handler" called at pass 2 before processing tag's children.
00097     'structHandler'  - "Structure handler" called at pass 2 after processing tag's children,
00098                        but before schema validity check. It can be used to implement structure
00099                        transformations.
00100     'publishHandler' - "Publish handler" called at pass 2 after schema validity check, so it is called
00101                        in case the element has its guaranteed place in the DOM tree.
00102 
00103     'attributes'     - an array that describes attributes transformations. Array's index is the
00104                        original name of an attribute, and the value is the new name.
00105 
00106     'requiredInputAttributes' - attributes that are required in the input tag. If any of them is
00107                                 missing, a schema error is raised for the tag.
00108 
00109     Example:
00110 
00111     public $OutputTags = array(
00112 
00113         'custom'    => array( 'parsingHandler' => 'parsingHandlerCustom',
00114                               'initHandler' => 'initHandlerCustom',
00115                               'structHandler' => 'structHandlerCustom',
00116                               'publishHandler' => 'publishHandlerCustom',
00117                               'attributes' => array( 'title' => 'name' ) ),
00118 
00119         ...
00120     );
00121 
00122     */
00123 
00124     public $OutputTags = array();
00125 
00126     public $Namespaces = array( 'image' => 'http://ez.no/namespaces/ezpublish3/image/',
00127                              'xhtml' => 'http://ez.no/namespaces/ezpublish3/xhtml/',
00128                              'custom' => 'http://ez.no/namespaces/ezpublish3/custom/',
00129                              'tmp' => 'http://ez.no/namespaces/ezpublish3/temporary/' );
00130 
/*!
 The constructor.

 \param $validateErrorLevel Determines the types of errors that break input processing.
                            Any combination (bitmask) of the self::ERROR_* constants is accepted.
                            \c false is treated as self::ERROR_NONE, \c true means that every
                            error type selected by \a $detectErrorLevel breaks further processing.
 \param $detectErrorLevel   Determines the types of errors that will be detected and added to the
                            error log ($Messages). Bitmask of self::ERROR_* constants; the deprecated
                            self::SHOW_SCHEMA_ERRORS / self::SHOW_ALL_ERRORS values are mapped to
                            self::ERROR_SCHEMA / self::ERROR_ALL.
 \param $parseLineBreaks    Stored in ParseLineBreaks; when true, line breaks of the input are
                            kept and turned into <br> elements during pass 1.
 \param $removeDefaultAttrs Stored in RemoveDefaultAttrs.
*/
function __construct( $validateErrorLevel = self::ERROR_NONE, $detectErrorLevel = self::ERROR_NONE, $parseLineBreaks = false,
                      $removeDefaultAttrs = false )
{
    // Back-compatibility: translate the deprecated SHOW_* values into ERROR_* bitmasks.
    if ( $detectErrorLevel === self::SHOW_SCHEMA_ERRORS )
    {
        $detectErrorLevel = self::ERROR_SCHEMA;
    }
    elseif ( $detectErrorLevel === self::SHOW_ALL_ERRORS )
    {
        $detectErrorLevel = self::ERROR_ALL;
    }

    // Back-compatibility: the first parameter used to be a boolean "validate" switch.
    if ( $validateErrorLevel === false )
    {
        $validateErrorLevel = self::ERROR_NONE;
    }
    elseif ( $validateErrorLevel === true )
    {
        $validateErrorLevel = $detectErrorLevel;
    }

    $this->ValidateErrorLevel = $validateErrorLevel;
    $this->DetectErrorLevel = $detectErrorLevel;

    $this->RemoveDefaultAttrs = $removeDefaultAttrs;
    $this->ParseLineBreaks = $parseLineBreaks;

    $this->XMLSchema = eZXMLSchema::instance();

    // NOTE(review): with this formula a two-digit minor version (e.g. 3.10) evaluates to 4.0 -
    // confirm that users of eZPublishVersion expect that.
    $this->eZPublishVersion = eZPublishSDK::majorVersion() + eZPublishSDK::minorVersion() * 0.1;

    // Optional switches of ezxml.ini; each one is stored in the property of the same name
    // and only overrides the property when the setting is actually present.
    $ini = eZINI::instance( 'ezxml.ini' );
    foreach ( array( 'TrimSpaces', 'AllowMultipleSpaces', 'AllowNumericEntities' ) as $setting )
    {
        if ( $ini->hasVariable( 'InputSettings', $setting ) )
        {
            $this->$setting = $ini->variable( 'InputSettings', $setting ) == 'true';
        }
    }

    $contentIni = eZINI::instance( 'content.ini' );
    $this->StrictHeaders = $contentIni->variable( 'header', 'UseStrictHeaderRule' ) == 'true';
}

/*!
 \deprecated PHP 4 style constructor name, kept so that existing code calling
             \c $this->eZXMLInputParser( ... ) explicitly keeps working.
             Methods named after the class are no longer constructors as of PHP 8,
             the real constructor is __construct().
*/
function eZXMLInputParser( $validateErrorLevel = self::ERROR_NONE, $detectErrorLevel = self::ERROR_NONE, $parseLineBreaks = false,
                           $removeDefaultAttrs = false )
{
    // self:: (not $this->) so that a subclass overriding __construct() cannot cause recursion.
    self::__construct( $validateErrorLevel, $detectErrorLevel, $parseLineBreaks, $removeDefaultAttrs );
}
00198 
/*!
 \public
 Sets the name of the class that is instantiated for the working document
 (see createRootNode() and process(), which call it with the arguments '1.0', 'utf-8').

 \param $DOMDocumentClass class name
*/
function setDOMDocumentClass( $DOMDocumentClass )
{
    $this->DOMDocumentClass = $DOMDocumentClass;
}
00204 
/*!
 \public
 Switches line break handling: when enabled, process() keeps "\n" characters
 and parseTag() converts each of them into a <br> element.

 \param $value new value of the ParseLineBreaks flag
*/
function setParseLineBreaks( $value )
{
    $this->ParseLineBreaks = $value;
}
00210 
/*!
 \public
 Stores the RemoveDefaultAttrs flag (same value as the fourth constructor parameter).

 \param $value new value of the flag
*/
function setRemoveDefaultAttrs( $value )
{
    $this->RemoveDefaultAttrs = $value;
}
00216 
/*!
 \public
 Appends the root <section> element to the working document, creating the
 document first if there is none yet, and declares the 'image', 'xhtml' and
 'custom' namespace prefixes on it.

 \return the working document
*/
function createRootNode()
{
    if ( !$this->Document )
    {
        $documentClass = $this->DOMDocumentClass;
        $this->Document = new $documentClass( '1.0', 'utf-8' );
    }
    $document = $this->Document;

    // Root section carrying the namespace definitions
    $rootSection = $document->createElement( 'section' );
    $document->appendChild( $rootSection );

    $xmlnsURI = 'http://www.w3.org/2000/xmlns/';
    foreach ( array( 'image', 'xhtml', 'custom' ) as $prefix )
    {
        $rootSection->setAttributeNS( $xmlnsURI, 'xmlns:' . $prefix, $this->Namespaces[$prefix] );
    }

    return $document;
}
00234 
/*!
 \public
 Call this function to process your input.

 Starts a fresh document, runs pass 1 (parsing) and pass 2 (tree processing) on it and
 writes the intermediate XML to the debug output after each pass.

 \param $text           input string
 \param $createRootNode when true the root <section> is created before parsing
 \return the resulting document, or \c false if the QuitProcess flag was raised in either pass
*/
function process( $text, $createRootNode = true )
{
    // Carriage returns are dropped and tabs become single spaces;
    // line feeds survive only when they are going to be parsed as line breaks.
    $text = str_replace( array( "\r", "\t" ), array( '', ' ' ), $text );
    if ( !$this->ParseLineBreaks )
    {
        $text = str_replace( "\n", '', $text );
    }

    $documentClass = $this->DOMDocumentClass;
    $this->Document = new $documentClass( '1.0', 'utf-8' );
    if ( $createRootNode )
    {
        $this->createRootNode();
    }

    // Pass 1: parse the source string
    $this->performPass1( $text );
    eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', $this->Document->saveXML(), 'XML after pass 1' );
    if ( $this->QuitProcess )
    {
        return false;
    }

    // Pass 2: walk the tree
    $this->performPass2();
    eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', $this->Document->saveXML(), 'XML after pass 2' );

    return $this->QuitProcess ? false : $this->Document;
}
00280 
/*
   \public
   Pass 1: Parsing the source HTML string.

   With a root element present, parseTag() is called on it repeatedly until the
   whole string is consumed; without one, a single parseTag() call creates the
   top-level node. Returns false if the QuitProcess flag was raised, true otherwise.
*/

function performPass1( &$data )
{
    $pos = 0;

    if ( !$this->Document->documentElement )
    {
        $noParent = null;
        $this->parseTag( $data, $pos, $noParent );
        return !$this->QuitProcess;
    }

    // At least one parseTag() call is always made, even for empty input.
    while ( true )
    {
        $root = $this->Document->documentElement;
        $this->parseTag( $data, $pos, $root );
        if ( $this->QuitProcess )
        {
            return false;
        }
        if ( $pos >= strlen( $data ) )
        {
            return true;
        }
    }
}
00316 
/*!
 The main recursive function for pass 1.

 Reads one token of \a $data starting at \a $pos - a run of text, a closing tag, a line break
 (only when ParseLineBreaks is enabled) or an opening tag - and moves \a $pos past it.
 Text and opening tags produce a node that is appended to \a $parent; the children of an
 opening tag are parsed by recursive calls.

 \param $data   Input string (passed on by reference to 'parsingHandler' callbacks).
 \param $pos    Current offset in \a $data, updated in place.
 \param $parent Node that receives the new node; if empty, the node is appended to the document.

 \return \c true when the children loop of the calling level has to stop (end of input,
         the enclosing tag was closed or auto-closed), \c false when the caller should
         continue with the next sibling.
*/
function parseTag( &$data, &$pos, &$parent )
{
    if ( $pos >= strlen( $data ) )
    {
        return true;
    }

    // Find tag, determine its type, name and attributes.
    $tagBeginPos = strpos( $data, '<', $pos );

    if ( $this->ParseLineBreaks )
    {
        // Regard line break as a start tag position
        $lineBreakPos = strpos( $data, "\n", $pos );
        if ( $lineBreakPos !== false )
        {
            $tagBeginPos = $tagBeginPos === false ? $lineBreakPos : min( $tagBeginPos, $lineBreakPos );
        }
    }

    $tagName = '';
    $attributes = null;
    $thisInputTag = null;

    // If it doesn't begin with '<' (or a line break) then it's a text node.
    if ( $tagBeginPos === false || $tagBeginPos != $pos )
    {
        $tagName = $newTagName = '#text';
        $noChildren = true;

        // No further tag: the text runs up to the end of the input.
        if ( $tagBeginPos === false )
        {
            $tagBeginPos = strlen( $data );
        }

        $textContent = $this->washText( substr( $data, $pos, $tagBeginPos - $pos ) );

        $pos = $tagBeginPos;
        if ( $textContent === '' )
        {
            return false;
        }
    }
    // Process closing tag.
    elseif ( $data[$tagBeginPos] == '<' && $tagBeginPos + 1 < strlen( $data ) &&
             $data[$tagBeginPos + 1] == '/' )
    {
        $tagEndPos = strpos( $data, '>', $tagBeginPos + 1 );
        if ( $tagEndPos === false )
        {
            $pos = $tagBeginPos + 1;

            $this->handleError( self::ERROR_SYNTAX, ezi18n( 'kernel/classes/datatypes/ezxmltext', 'Wrong closing tag' ) );
            return false;
        }

        $pos = $tagEndPos + 1;
        $closedTagName = strtolower( trim( substr( $data, $tagBeginPos + 2, $tagEndPos - $tagBeginPos - 2 ) ) );

        // Find matching tag in ParentStack array, innermost open tag first.
        $firstLoop = true;
        for ( $i = count( $this->ParentStack ) - 1; $i >= 0; $i-- )
        {
            $parentNames = $this->ParentStack[$i];
            if ( $parentNames[0] == $closedTagName )
            {
                array_pop( $this->ParentStack );
                if ( !$firstLoop )
                {
                    // The match is an outer tag: close the current level and let the
                    // level above look at the same closing tag again.
                    $pos = $tagBeginPos;
                    return true;
                }
                // If newTagName was '' no element was created for the opening tag,
                // so we don't break the children loop.
                return $parentNames[1] !== '';
            }
            $firstLoop = false;
        }

        $this->handleError( self::ERROR_SYNTAX, ezi18n( 'kernel/classes/datatypes/ezxmltext', 'Wrong closing tag : &lt;/%1&gt;.', false, array( $closedTagName ) ) );

        return false;
    }
    // Insert <br/> instead of linebreaks
    elseif ( $this->ParseLineBreaks && $data[$tagBeginPos] == "\n" )
    {
        $newTagName = 'br';
        $noChildren = true;
        $pos = $tagBeginPos + 1;
    }
    // Regular tag: get tag's name and attributes.
    else
    {
        $tagEndPos = strpos( $data, '>', $tagBeginPos );
        if ( $tagEndPos === false )
        {
            $pos = $tagBeginPos + 1;

            $this->handleError( self::ERROR_SYNTAX, ezi18n( 'kernel/classes/datatypes/ezxmltext', 'Wrong opening tag' ) );
            return false;
        }

        $pos = $tagEndPos + 1;
        $tagString = substr( $data, $tagBeginPos + 1, $tagEndPos - $tagBeginPos - 1 );
        // A final slash marks an empty element (<tag/>)
        $noChildren = substr( $tagString, -1, 1 ) == '/';
        // Remove final slash and the spaces in front of it
        $tagString = preg_replace( "/\s*\/$/", "", $tagString );

        $firstSpacePos = strpos( $tagString, ' ' );
        if ( $firstSpacePos === false )
        {
            $tagName = strtolower( trim( $tagString ) );
            $attributeString = '';
        }
        else
        {
            $tagName = strtolower( substr( $tagString, 0, $firstSpacePos ) );
            $attributeString = trim( substr( $tagString, $firstSpacePos + 1 ) );
            // Parse attribute string
            if ( $attributeString )
            {
                $attributes = $this->parseAttributes( $attributeString );
            }
        }

        // Determine tag's name
        if ( isset( $this->InputTags[$tagName] ) )
        {
            $thisInputTag = $this->InputTags[$tagName];

            if ( isset( $thisInputTag['name'] ) )
            {
                $newTagName = $thisInputTag['name'];
            }
            else
            {
                $newTagName = $this->callInputHandler( 'nameHandler', $tagName, $attributes );
            }
        }
        elseif ( $this->XMLSchema->exists( $tagName ) )
        {
            $newTagName = $tagName;
        }
        else
        {
            $this->handleError( self::ERROR_SYNTAX, ezi18n( 'kernel/classes/datatypes/ezxmltext', 'Unknown tag: &lt;%1&gt;.', false, array( $tagName ) ) );
            return false;
        }

        // Check 'noChildren' property. It is documented as a boolean, so its value is
        // honoured: 'noChildren' => false no longer forces the tag to be childless.
        if ( !empty( $thisInputTag['noChildren'] ) )
        {
            $noChildren = true;
        }

        $thisOutputTag = isset( $this->OutputTags[$newTagName] ) ? $this->OutputTags[$newTagName] : null;

        // Implementation of 'autoCloseOn' rule ( Handling of unclosed tags, ex.: <p>, <li> )
        if ( isset( $thisOutputTag['autoCloseOn'] ) &&
             $parent &&
             $parent->parentNode instanceof DOMElement &&
             in_array( $parent->nodeName, $thisOutputTag['autoCloseOn'] ) )
        {
            // Wrong nesting: auto-close parent and try to re-parse this tag at higher level
            array_pop( $this->ParentStack );
            $pos = $tagBeginPos;
            return true;
        }

        // Append to parent stack
        if ( !$noChildren && $newTagName !== false )
        {
            $this->ParentStack[] = array( $tagName, $newTagName, $attributeString );
        }

        if ( !$newTagName )
        {
            // If $newTagName is an empty string then it's not an error:
            // the tag itself is dropped, only false reports a failed conversion.
            if ( $newTagName === false )
            {
                $this->handleError( self::ERROR_SYNTAX, ezi18n( 'kernel/classes/datatypes/ezxmltext', "Can't convert tag's name: &lt;%1&gt;.", false, array( $tagName ) ) );
            }

            return false;
        }

        // wordmatch.ini support
        if ( $attributeString )
        {
            $attributes = $this->wordMatchSupport( $newTagName, $attributes, $attributeString );
        }
    }

    // Create text or normal node.
    if ( $newTagName == '#text' )
    {
        $element = $this->Document->createTextNode( $textContent );
    }
    else
    {
        $element = $this->Document->createElement( $newTagName );
    }

    if ( $attributes )
    {
        $this->setAttributes( $element, $attributes );
    }

    // Append element as a child or set it as root if there is no parent.
    if ( $parent )
    {
        $parent->appendChild( $element );
    }
    else
    {
        $this->Document->appendChild( $element );
    }

    // The parsing handler receives the input, the position and the tag start by reference.
    $params = array();
    $params[] =& $data;
    $params[] =& $pos;
    $params[] =& $tagBeginPos;
    $result = $this->callOutputHandler( 'parsingHandler', $element, $params );

    if ( $result === false )
    {
        // This tag is already parsed in handler
        if ( !$noChildren )
        {
            array_pop( $this->ParentStack );
        }
        return false;
    }

    if ( $this->QuitProcess )
    {
        return false;
    }

    // Process children until one of the recursive calls reports the end of this level.
    if ( !$noChildren )
    {
        do
        {
            $parseResult = $this->parseTag( $data, $pos, $element );
            if ( $this->QuitProcess )
            {
                return false;
            }
        }
        while ( $parseResult !== true );
    }

    return false;
}
00585 
00586     /*
00587         Helper functions for pass 1
00588     */
00589 
/*!
 Splits the attribute part of a tag into a name => value array.

 Single-quoted and unquoted values are first rewritten to the double-quoted form,
 then the string is split after every closing double quote that is followed by spaces.
 Attribute names are lower-cased; attributes with an empty value are skipped.

 \param $attributeString attribute part of a tag, e.g. \c href='x' target=_blank
 \return array( attribute name => attribute value )

 \note The rewriting is done with plain replacement strings; the former "/e" (eval)
       pattern modifier is not available in PHP 7 and later.
*/
function parseAttributes( $attributeString )
{
    // Convert single quotes to double quotes
    $attributeString = preg_replace( "/ +([a-zA-Z0-9:_#\-]+) *\='(.*?)'/", ' \1="\2"', ' ' . $attributeString );

    // Convert no quotes to double quotes and remove extra spaces
    $attributeString = preg_replace( "/ +([a-zA-Z0-9:_#\-]+) *\= *([^\s'\"]+)/", ' \1="\2" ', $attributeString );

    // Split by quotes followed by spaces
    $attributeArray = preg_split( "#(?<=\") +#", $attributeString );

    $attributes = array();
    foreach ( $attributeArray as $attrStr )
    {
        // The shortest possible attribute, a="", is 4 characters long
        if ( !$attrStr || strlen( $attrStr ) < 4 )
        {
            continue;
        }

        // Limit of 2 keeps a value that itself contains ="  in one piece
        $parts = preg_split( "/ *= *\"/", $attrStr, 2 );
        if ( count( $parts ) < 2 )
        {
            continue;
        }

        $attrName = strtolower( trim( $parts[0] ) );
        if ( !$attrName )
        {
            continue;
        }

        // Cut off the closing double quote
        $attrValue = substr( $parts[1], 0, -1 );
        if ( $attrValue === '' || $attrValue === false )
        {
            continue;
        }

        $attributes[$attrName] = $attrValue;
    }

    return $attributes;
}
00628 
/*!
 Copies the parsed input attributes onto \a $element.

 Attribute names are translated through the 'attributes' map of the element's entry in
 $OutputTags (a mapped name that evaluates to false drops the attribute), 'class' values
 are checked against the classes list of the XML schema, and prefixed names are set with
 the namespace registered for the prefix in $Namespaces. Finally every name listed in
 'requiredInputAttributes' must be present among the input attributes, otherwise a
 schema error is reported.

 \param $element    element that receives the attributes
 \param $attributes array( input attribute name => value )
*/
function setAttributes( $element, $attributes )
{
    $tagName = $element->nodeName;
    // Not every element has an entry in OutputTags
    $thisOutputTag = isset( $this->OutputTags[$tagName] ) ? $this->OutputTags[$tagName] : array();
    $attributeMap = isset( $thisOutputTag['attributes'] ) ? $thisOutputTag['attributes'] : array();

    foreach ( $attributes as $key => $value )
    {
        // Convert attribute names
        $qualifiedName = isset( $attributeMap[$key] ) ? $attributeMap[$key] : $key;

        // Filter classes
        if ( $qualifiedName == 'class' )
        {
            $classesList = $this->XMLSchema->getClassesList( $tagName );
            if ( !in_array( $value, $classesList ) )
            {
                $this->handleError( self::ERROR_DATA,
                                    ezi18n( 'kernel/classes/datatypes/ezxmltext', "Class '%1' is not allowed for element &lt;%2&gt; (check content.ini).",
                                    false, array( $value, $tagName ) ) );
                continue;
            }
        }

        // A mapped name evaluating to false means: drop this attribute
        if ( !$qualifiedName )
        {
            continue;
        }

        // Create attribute nodes
        if ( strpos( $qualifiedName, ':' ) )
        {
            list( $prefix ) = explode( ':', $qualifiedName, 2 );
            if ( isset( $this->Namespaces[$prefix] ) )
            {
                $element->setAttributeNS( $this->Namespaces[$prefix], $qualifiedName, $value );
            }
            else
            {
                eZDebug::writeWarning( "No namespace defined for prefix '$prefix'.", 'eZXML input parser' );
            }
        }
        else
        {
            $element->setAttribute( $qualifiedName, $value );
        }
    }

    // Check that the required attributes are present (by their input names)
    if ( isset( $thisOutputTag['requiredInputAttributes'] ) )
    {
        foreach ( $thisOutputTag['requiredInputAttributes'] as $reqAttrName )
        {
            if ( !array_key_exists( $reqAttrName, $attributes ) )
            {
                $this->handleError( self::ERROR_SCHEMA,
                                    ezi18n( 'kernel/classes/datatypes/ezxmltext', "Required attribute '%1' is not presented in tag &lt;%2&gt;.",
                                    false, array( $reqAttrName, $tagName ) ) );
            }
        }
    }
}
00705 
/*!
 Prepares the content of a text node: decodes the known named entities, converts
 numeric entities unless AllowNumericEntities is set and collapses runs of spaces
 into one space unless AllowMultipleSpaces is set.

 \param $textContent raw text taken from the input
 \return the cleaned text
*/
function washText( $textContent )
{
    $washed = $this->entitiesDecode( $textContent );

    if ( !$this->AllowNumericEntities )
    {
        $washed = $this->convertNumericEntities( $washed );
    }

    if ( $this->AllowMultipleSpaces )
    {
        return $washed;
    }

    return preg_replace( "/ {2,}/", " ", $washed );
}
00722 
/*!
 Replaces the entities &#039; &gt; &lt; &apos; &quot; &nbsp; and &amp; by their characters.

 \param $text text that may contain entities
 \return the decoded text
*/
function entitiesDecode( $text )
{
    // str_replace() applies the pairs one after another to the running result, so the
    // order matters: '&amp;' has to come last, otherwise an escaped entity such as
    // '&amp;nbsp;' would be decoded twice and end up as a space instead of '&nbsp;'.
    $entities = array( '&#039;', '&gt;', '&lt;', '&apos;', '&quot;', '&nbsp;', '&amp;' );
    $characters = array( "'", '>', '<', "'", '"', ' ', '&' );

    return str_replace( $entities, $characters, $text );
}
00735 
/*!
 Replaces decimal numeric entities (&#NNN;) by the characters the 'unicode' text codec
 produces for their code.

 Everything else is copied unchanged, including a '&#' that is not followed by digits
 and a ';' (for example an unterminated entity or the text "R&#D;").

 \param $text text that may contain numeric entities
 \return the converted text
*/
function convertNumericEntities( $text )
{
    // The shortest entity, &#N;, is 4 characters long
    $length = strlen( $text );
    if ( $length < 4 )
    {
        return $text;
    }

    // Convert other HTML entities to the current charset characters.
    $codec = eZTextCodec::instance( 'unicode', false );
    $domString = '';
    $pos = 0;
    while ( $pos < $length )
    {
        $entityPos = strpos( $text, '&#', $pos );
        if ( $entityPos === false )
        {
            // No more entities: copy the tail, including the very last character
            $domString .= substr( $text, $pos );
            break;
        }

        // Text in front of the entity
        $domString .= substr( $text, $pos, $entityPos - $pos );

        $endPos = strpos( $text, ';', $entityPos + 2 );
        if ( $endPos === false )
        {
            // No ';' left at all, so nothing behind this point can be a complete entity
            $domString .= substr( $text, $entityPos );
            break;
        }

        $code = substr( $text, $entityPos + 2, $endPos - $entityPos - 2 );
        if ( !preg_match( '/^[0-9]+$/', $code ) )
        {
            // Not a decimal entity: keep the '&#' literally and go on behind it
            $domString .= '&#';
            $pos = $entityPos + 2;
            continue;
        }

        $domString .= $codec->convertString( array( $code ) );
        $pos = $endPos + 1;
    }

    return $domString;
}
00780 
/*!
 wordmatch.ini support: every entry of the 'MatchString' setting in the block named after
 the tag is used as a case-insensitive regular expression on the raw attribute string.
 On a match the entry's key becomes the 'class' attribute and 'style' is removed.

 \param $newTagName      output tag name, used as ini block name
 \param $attributes      parsed attributes of the tag
 \param $attributeString unparsed attribute string of the tag
 \return the (possibly modified) attributes
*/
protected function wordMatchSupport( $newTagName, $attributes, $attributeString )
{
    $wordMatchIni = eZINI::instance( 'wordmatch.ini' );
    if ( !$wordMatchIni->hasVariable( $newTagName, 'MatchString' ) )
    {
        return $attributes;
    }

    $patterns = $wordMatchIni->variable( $newTagName, 'MatchString' );
    if ( !$patterns )
    {
        return $attributes;
    }

    // Every matching entry is applied, so the last match decides the class
    foreach ( $patterns as $className => $pattern )
    {
        if ( preg_match( "/$pattern/i", $attributeString ) )
        {
            $attributes['class'] = $className;
            unset( $attributes['style'] );
        }
    }

    return $attributes;
}
00805 
00806 
/*!
    \public
    Pass 2: Process the tree, run handlers, rebuild and validate.
    Starts the recursive processSubtree() walk at the document element.
*/

function performPass2()
{
    // processSubtree() takes its second argument by reference, so a variable is required
    $noHandlerResult = null;
    $root = $this->Document->documentElement;

    $this->processSubtree( $root, $noHandlerResult );
}
00818 
00819     // main recursive function for pass 2
00820 
00821     function processSubtree( $element, &$lastHandlerResult )
00822     {
00823         $ret = null;
00824         $tmp = null;
00825 
00826         // Call "Init handler"
00827         $this->callOutputHandler( 'initHandler', $element, $tmp );
00828 
00829         // Process children
00830         if ( $element->hasChildNodes() )
00831         {
00832             // Make another copy of children to save primary structure
00833             $childNodes = $element->childNodes;
00834             $childrenCount = $childNodes->length;
00835 
00836             // we can not loop directly over the childNodes property, because this will change while we are working on it's parent's children
00837             $children = array();
00838             foreach ( $childNodes as $childNode )
00839             {
00840                 $children[] = $childNode;
00841             }
00842 
00843             $lastResult = null;
00844             $newElements = array();
00845             foreach ( $children as $child )
00846             {
00847                 eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', 'processing children, current child: ' . $child->nodeName );
00848                 $childReturn = $this->processSubtree( $child, $lastResult );
00849 
00850                 unset( $lastResult );
00851                 if ( isset( $childReturn['result'] ) )
00852                 {
00853                     eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', 'return result is set for child ' . $child->nodeName );
00854                     $lastResult = $childReturn['result'];
00855                 }
00856 
00857                 if ( isset( $childReturn['new_elements'] ) )
00858                 {
00859                     $newElements = array_merge( $newElements, $childReturn['new_elements'] );
00860                 }
00861 
00862                 if ( $this->QuitProcess )
00863                 {
00864                     return $ret;
00865                 }
00866             }
00867 
00868             eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', $this->Document->saveXML(), 'XML before processNewElements for element ' . $element->nodeName );
00869             // process elements created in children handlers
00870             $this->processNewElements( $newElements );
00871             eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', $this->Document->saveXML(), 'XML after processNewElements for element ' . $element->nodeName );
00872         }
00873 
00874         // Call "Structure handler"
00875         eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', $this->Document->saveXML(), 'XML before callOutputHandler structHandler for element ' . $element->nodeName );
00876         $ret = $this->callOutputHandler( 'structHandler', $element, $lastHandlerResult );
00877         eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', $this->Document->saveXML(), 'XML after callOutputHandler structHandler for element ' . $element->nodeName );
00878         eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', $ret, 'return value of callOutputHandler structHandler for element ' . $element->nodeName );
00879 
00880         // Process by schema (check if element is allowed to exist)
00881         if ( !$this->processBySchemaPresence( $element ) )
00882         {
00883             eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', $this->Document->saveXML(), 'XML after processBySchemaPresence for element ' . $element->nodeName );
00884             return $ret;
00885         }
00886 
00887         eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', $this->Document->saveXML(), 'XML after processBySchemaPresence for element ' . $element->nodeName );
00888 
00889         // Process by schema (check place in the tree)
00890         if ( !$this->processBySchemaTree( $element ) )
00891         {
00892             eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', $this->Document->saveXML(), 'XML after processBySchemaTree for element ' . $element->nodeName );
00893             return $ret;
00894         }
00895 
00896         eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', $this->Document->saveXML(), 'XML after processBySchemaTree for element ' . $element->nodeName );
00897 
00898 
00899         $tmp = null;
00900         // Call "Publish handler"
00901         $this->callOutputHandler( 'publishHandler', $element, $tmp );
00902 
00903         // Process attributes according to the schema
00904         if ( $element->hasAttributes() )
00905         {
00906             if ( !$this->XMLSchema->hasAttributes( $element ) )
00907             {
00908                 eZXMLInputParser::removeAllAttributes( $element );
00909             }
00910             else
00911             {
00912                 $this->processAttributesBySchema( $element );
00913             }
00914         }
00915         return $ret;
00916     }
00917     /*
00918         Helper functions for pass 2
00919     */
00920 
00921     /*!
00922        Removes all attribute nodes from element node $element
00923     */
00924     function removeAllAttributes( DOMElement $element )
00925     {
00926         $attribs = $element->attributes;
00927         for ( $i = $attribs->length - 1; $i >= 0; $i-- )
00928         {
00929             $element->removeAttributeNode( $attribs->item( $i ) );
00930         }
00931     }
00932 
00933     // Check if the element is allowed to exist in this document and remove it if not.
00934     function processBySchemaPresence( $element )
00935     {
00936         $parent = $element->parentNode;
00937         if ( $parent instanceof DOMElement )
00938         {
00939             // If this is a foreign element, remove it
00940             if ( !$this->XMLSchema->exists( $element ) )
00941             {
00942                 if ( $element->nodeName == 'custom' )
00943                 {
00944                     $this->handleError( self::ERROR_SCHEMA,
00945                                         ezi18n( 'kernel/classes/datatypes/ezxmltext', "Custom tag '%1' is not allowed.",
00946                                         false, array( $element->getAttribute( 'name' ) ) ) );
00947                 }
00948                 $element = $parent->removeChild( $element );
00949                 return false;
00950             }
00951 
00952             // Delete if children required and no children
00953             // If this is an auto-added element, then do not throw error
00954 
00955             if ( $element->nodeType == XML_ELEMENT_NODE && ( $this->XMLSchema->childrenRequired( $element ) || $element->getAttribute( 'children_required' ) )
00956                  && !$element->hasChildNodes() )
00957             {
00958                 $element = $parent->removeChild( $element );
00959                 if ( !$element->getAttributeNS( 'http://ez.no/namespaces/ezpublish3/temporary/', 'new-element' ) )
00960                 {
00961                     $this->handleError( self::ERROR_SCHEMA, ezi18n( 'kernel/classes/datatypes/ezxmltext', "&lt;%1&gt; tag can't be empty.",
00962                                         false, array( $element->nodeName ) ) );
00963                     return false;
00964                 }
00965             }
00966         }
00967         // TODO: break processing of any node that doesn't have parent
00968         //       and is not a root node.
00969         elseif ( $element->nodeName != 'section' )
00970         {
00971             return false;
00972         }
00973         return true;
00974     }
00975 
00976     // Check that element has a correct position in the tree and fix it if not.
00977     function processBySchemaTree( $element )
00978     {
00979         $parent = $element->parentNode;
00980 
00981         if ( $parent instanceof DOMElement )
00982         {
00983             $schemaCheckResult = $this->XMLSchema->check( $parent, $element );
00984             if ( !$schemaCheckResult )
00985             {
00986                 if ( $schemaCheckResult === false )
00987                 {
00988                     // Remove indenting spaces
00989                     if ( $element->nodeType == XML_TEXT_NODE && !trim( $element->textContent ) )
00990                     {
00991                         $element = $parent->removeChild( $element );
00992                         return false;
00993                     }
00994 
00995                     $elementName = $element->nodeType == XML_ELEMENT_NODE ? '&lt;' . $element->nodeName . '&gt;' : $element->nodeName;
00996                     $this->handleError( self::ERROR_SCHEMA, ezi18n( 'kernel/classes/datatypes/ezxmltext', "%1 is not allowed to be a child of &lt;%2&gt;.",
00997                                         false, array( $elementName, $parent->nodeName ) ) );
00998                 }
00999                 $this->fixSubtree( $element, $element );
01000                 return false;
01001             }
01002         }
01003         // TODO: break processing of any node that doesn't have parent
01004         //       and is not a root node.
01005         elseif ( $element->nodeName != 'section' )
01006         {
01007             return false;
01008         }
01009         return true;
01010     }
01011 
01012     // Remove only nodes that don't match schema (recursively)
01013     function fixSubtree( $element, $mainChild )
01014     {
01015         $parent = $element->parentNode;
01016         $mainParent = $mainChild->parentNode;
01017         while ( $element->hasChildNodes() )
01018         {
01019             $child = $element->firstChild;
01020 
01021             $child = $element->removeChild( $child );
01022             $child = $mainParent->insertBefore( $child, $mainChild );
01023 
01024             if ( !$this->XMLSchema->check( $mainParent, $child ) )
01025             {
01026                 $this->fixSubtree( $child, $mainChild );
01027             }
01028         }
01029         $parent->removeChild( $element );
01030     }
01031 
01032     function processAttributesBySchema( $element )
01033     {
01034         // Remove attributes that don't match schema
01035         $schemaAttributes = $this->XMLSchema->attributes( $element );
01036         $schemaCustomAttributes = $this->XMLSchema->customAttributes( $element );
01037 
01038         $attributes = $element->attributes;
01039 
01040         for ( $i = $attributes->length - 1; $i >=0; $i-- )
01041         {
01042             $attr = $attributes->item( $i );
01043             if ( $attr->prefix == 'tmp' )
01044             {
01045                 $element->removeAttributeNode( $attr );
01046                 continue;
01047             }
01048 
01049             $allowed = false;
01050             $removeAttr = false;
01051 
01052             $fullName = $attr->prefix ? $attr->prefix . ':' . $attr->localName : $attr->nodeName;
01053 
01054             // check for allowed custom attributes (3.9)
01055             if ( $attr->prefix == 'custom' && in_array( $attr->localName, $schemaCustomAttributes ) )
01056             {
01057                 $allowed = true;
01058             }
01059             else
01060             {
01061                 if ( in_array( $fullName, $schemaAttributes ) )
01062                 {
01063                    $allowed = true;
01064                 }
01065                 elseif ( in_array( $fullName, $schemaCustomAttributes ) )
01066                 {
01067                     // add 'custom' prefix if it is not given
01068                     $allowed = true;
01069                     $removeAttr = true;
01070                     $element->setAttributeNS( $this->Namespaces['custom'], 'custom:' . $fullName, $attr->value );
01071                 }
01072             }
01073 
01074             if ( !$allowed )
01075             {
01076                 $removeAttr = true;
01077                 $this->handleError( self::ERROR_SCHEMA,
01078                                     ezi18n( 'kernel/classes/datatypes/ezxmltext', "Attribute '%1' is not allowed in &lt;%2&gt; element.",
01079                                     false, array( $fullName, $element->nodeName ) ) );
01080             }
01081             elseif ( $this->RemoveDefaultAttrs )
01082             {
01083                 // Remove attributes having default values
01084                 $default = $this->XMLSchema->attrDefaultValue( $element->nodeName, $fullName );
01085                 if ( $attr->value == $default )
01086                 {
01087                     $removeAttr = true;
01088                 }
01089             }
01090 
01091             if ( $removeAttr )
01092             {
01093                 $element->removeAttributeNode( $attr );
01094             }
01095         }
01096     }
01097 
01098     function callInputHandler( $handlerName, $tagName, &$attributes )
01099     {
01100         $result = null;
01101         $thisInputTag = $this->InputTags[$tagName];
01102         if ( isset( $thisInputTag[$handlerName] ) )
01103         {
01104             if ( is_callable( array( $this, $thisInputTag[$handlerName] ) ) )
01105             {
01106                 $result = call_user_func_array( array( $this, $thisInputTag[$handlerName] ),
01107                                                 array( $tagName, &$attributes ) );
01108             }
01109             else
01110             {
01111                 eZDebug::writeWarning( "'$handlerName' input handler for tag <$tagName> doesn't exist: '" . $thisInputTag[$handlerName] . "'.", 'eZXML input parser' );
01112             }
01113         }
01114         return $result;
01115     }
01116 
01117     function callOutputHandler( $handlerName, $element, &$params )
01118     {
01119         $result = null;
01120         $thisOutputTag = $this->OutputTags[$element->nodeName];
01121         if ( isset( $thisOutputTag[$handlerName] ) )
01122         {
01123             if ( is_callable( array( $this, $thisOutputTag[$handlerName] ) ) )
01124             {
01125                 $result = call_user_func_array( array( $this, $thisOutputTag[$handlerName] ),
01126                                                 array( $element, &$params ) );
01127             }
01128             else
01129             {
01130                 eZDebug::writeWarning( "'$handlerName' output handler for tag <$element->nodeName> doesn't exist: '" . $thisOutputTag[$handlerName] . "'.", 'eZXML input parser' );
01131             }
01132         }
01133 
01134         return $result;
01135     }
01136 
01137     // Creates new element and adds it to array for further post-processing.
01138     // Use this function if you need to process newly created element (check it by schema
01139     // and call 'structure' and 'publish' handlers)
01140     function createAndPublishElement( $elementName, &$ret )
01141     {
01142         $element = $this->Document->createElement( $elementName );
01143         $element->setAttributeNS( 'http://ez.no/namespaces/ezpublish3/temporary/', 'tmp:new-element', 'true' );
01144 
01145         if ( !isset( $ret['new_elements'] ) )
01146         {
01147             $ret['new_elements'] = array();
01148         }
01149 
01150         $ret['new_elements'][] = $element;
01151         return $element;
01152     }
01153 
01154     function processNewElements( $createdElements )
01155     {
01156         // Call handlers for newly created elements
01157         foreach ( $createdElements as $element )
01158         {
01159             eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', 'processing new element ' . $element->nodeName );
01160             $tmp = null;
01161 
01162             if ( !$this->processBySchemaPresence( $element ) )
01163             {
01164                 eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', $this->Document->saveXML(), 'xml string after processBySchemaPresence for new element ' . $element->nodeName );
01165                 continue;
01166             }
01167             eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', $this->Document->saveXML(), 'xml string after processBySchemaPresence for new element ' . $element->nodeName );
01168 
01169 
01170             // Call "Structure handler"
01171             $this->callOutputHandler( 'structHandler', $element, $tmp );
01172 
01173             if ( !$this->processBySchemaTree( $element ) )
01174             {
01175                 eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', $this->Document->saveXML(), 'xml string after processBySchemaTree for new element ' . $element->nodeName );
01176                 continue;
01177             }
01178             eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', $this->Document->saveXML(), 'xml string after processBySchemaTree for new element ' . $element->nodeName );
01179 
01180 
01181             $tmp2 = null;
01182             // Call "Publish handler"
01183             $this->callOutputHandler( 'publishHandler', $element, $tmp2 );
01184             eZDebugSetting::writeDebug( 'kernel-datatype-ezxmltext', $this->Document->saveXML(), 'xml string after callOutputHandler publishHandler for new element ' . $element->nodeName );
01185 
01186             // Process attributes according to the schema
01187             if( $element->hasAttributes() )
01188             {
01189                 if ( !$this->XMLSchema->hasAttributes( $element ) )
01190                 {
01191                     eZXMLInputParser::removeAllAttributes( $element );
01192                 }
01193                 else
01194                 {
01195                     $this->processAttributesBySchema( $element );
01196                 }
01197             }
01198         }
01199     }
01200 
01201     /// \public
01202     function getMessages()
01203     {
01204         return $this->Messages;
01205     }
01206 
01207     /// \public
01208     function isValid()
01209     {
01210         return $this->IsInputValid;
01211     }
01212 
01213     function handleError( $type, $message )
01214     {
01215         if ( $type & $this->DetectErrorLevel )
01216         {
01217             $this->IsInputValid = false;
01218             if ( $message )
01219             {
01220                 $this->Messages[] = $message;
01221             }
01222         }
01223 
01224         if ( $type & $this->ValidateErrorLevel )
01225         {
01226             $this->IsInputValid = false;
01227             $this->QuitProcess = true;
01228         }
01229     }
01230 
    // NOTE(review): presumably the name of the class instantiated for $Document - confirm.
    // The value is spelled 'DOMDOcument'; PHP resolves class names case-insensitively,
    // so this presumably still refers to DOMDocument - verify before normalising it.
    public $DOMDocumentClass = 'DOMDOcument';

    // Schema object queried during pass 2 (exists(), check(), childrenRequired(),
    // hasAttributes(), attributes(), customAttributes(), attrDefaultValue()).
    public $XMLSchema;
    // DOM document being processed; pass 2 starts at its documentElement.
    public $Document = null;
    // Error messages appended by handleError() and returned by getMessages().
    public $Messages = array();
    // NOTE(review): not referenced in the pass 2 code; purpose to be confirmed.
    public $eZPublishVersion;

    // NOTE(review): not referenced in the pass 2 code; presumably used while
    // building the tree in pass 1 - confirm.
    public $ParentStack = array();

    // Bit mask of error types: a matching error makes handleError() mark the
    // input as invalid and set QuitProcess.
    public $ValidateErrorLevel;
    // Bit mask of error types: a matching error makes handleError() mark the
    // input as invalid and record its message.
    public $DetectErrorLevel;

    // Set to false by handleError(); returned by isValid().
    public $IsInputValid = true;
    // Set to true by handleError(); processSubtree() stops walking children when it is set.
    public $QuitProcess = false;

    // options that depend on settings
    // NOTE(review): these four are not referenced in the pass 2 code; their
    // exact effect should be confirmed against pass 1.
    public $TrimSpaces = true;
    public $AllowMultipleSpaces = false;
    public $AllowNumericEntities = false;
    public $StrictHeaders = false;

    // options that depend on parameters passed
    // NOTE(review): ParseLineBreaks is not referenced in the pass 2 code - confirm its use.
    public $ParseLineBreaks = false;
    // When true, processAttributesBySchema() removes attributes whose value
    // equals the schema default.
    public $RemoveDefaultAttrs = false;
01255 }
01256 ?>