eZ Publish  [trunk]
ezxmlinputparser.php
Go to the documentation of this file.
00001 <?php
00002 /**
00003  * File containing the eZXMLInputParser class.
00004  *
00005  * @copyright Copyright (C) 1999-2012 eZ Systems AS. All rights reserved.
00006  * @license http://www.gnu.org/licenses/gpl-2.0.txt GNU General Public License v2
00007  * @version //autogentag//
00008  * @package kernel
00009  */
00010 
00011 /*
00012     Base class for the input parser.
00013     The goal of the parser is XML/HTML analyzing, fixing and transforming.
00014     The input is processed in 2 passes:
00015     - 1st pass: Parsing input, check for syntax errors, build DOM tree.
00016     - 2nd pass: Walking through DOM tree, checking validity by XML schema,
00017                 calling tag handlers to transform the tree.
00018 
00019     Both passes are controlled by the arrays described below and by user handler functions.
00020 
00021 */
00022 
00023 // if ( !class_exists( 'eZXMLSchema' ) ) // AS 21-09-2007: commented out because of include_once being commented out
00024 class eZXMLInputParser
00025 {
00026     /// \deprecated (back-compatibility)
00027     const SHOW_NO_ERRORS = 0;
00028     const SHOW_SCHEMA_ERRORS = 1;
00029     const SHOW_ALL_ERRORS = 2;
00030 
00031     /// Use these constants for error types
00032     const ERROR_NONE = 0;
00033     const ERROR_SYNTAX = 4;
00034     const ERROR_SCHEMA = 8;
00035     const ERROR_DATA = 16;
00036     const ERROR_ALL = 28; // 4+8+16
00037 
00038     /* $InputTags array contains properties of elements that come from the input.
00039 
00040     Each array element describes a tag that comes from the input. Arrays index is
00041     a tag's name. Each element is an array that may contain the following members:
00042 
00043     'name'        - a string representing a new name of the tag,
00044     'nameHandler' - a name of the function that returns new tag name. Function format:
00045                     function tagNameHandler( $tagName, &$attributes )
00046 
00047     If none of those elements is defined, the original tag's name is used.
00048 
00049     'noChildren'  - boolean value that determines if this tag could have child tags,
00050                     default value is false.
00051 
00052     Example:
00053 
00054     public $InputTags = array(
00055 
00056         'original-name' => array( 'name' => 'new-name' ),
00057 
00058         'original-name2' => array( 'nameHandler' => 'tagNameHandler',
00059                                    'noChildren' => true ),
00060 
00061          ...
00062 
00063          );
00064     */
00065 
00066     public $InputTags = array();
00067 
00068     /*
00069     $OutputTags array contains properties of elements that are produced in the output.
00070     Each array element describes a tag presented in the output. Arrays index is
00071     a tag's name. Each element is an array that may contain the following members:
00072 
00073     'parsingHandler' - "Parsing handler" called at parse pass 1 before processing tag's children.
00074     'initHandler'    - "Init handler" called at pass 2 before processing tag's children.
00075     'structHandler'  - "Structure handler" called at pass 2 after processing tag's children,
00076                        but before schema validity check. It can be used to implement structure
00077                        transformations.
00078     'publishHandler' - "Publish handler" called at pass 2 after schema validity check, so it is called
00079                        in case the element has its guaranteed place in the DOM tree.
00080 
00081     'attributes'     - an array that describes attributes transformations. Array's index is the
00082                        original name of an attribute, and the value is the new name.
00083 
00084     'requiredInputAttributes' - attributes that are required in the input tag. If any of them is
00085                                 missing, the invalid input flag is raised.
00086 
00087     Example:
00088 
00089     public $OutputTags = array(
00090 
00091         'custom'    => array( 'parsingHandler' => 'parsingHandlerCustom',
00092                               'initHandler' => 'initHandlerCustom',
00093                               'structHandler' => 'structHandlerCustom',
00094                               'publishHandler' => 'publishHandlerCustom',
00095                               'attributes' => array( 'title' => 'name' ) ),
00096 
00097         ...
00098     );
00099 
00100     */
00101 
00102     public $OutputTags = array();
00103 
00104     public $Namespaces = array( 'image' => 'http://ez.no/namespaces/ezpublish3/image/',
00105                              'xhtml' => 'http://ez.no/namespaces/ezpublish3/xhtml/',
00106                              'custom' => 'http://ez.no/namespaces/ezpublish3/custom/',
00107                              'tmp' => 'http://ez.no/namespaces/ezpublish3/temporary/' );
00108 
/*!
 Constructor.

 \param $validateErrorLevel Determines the types of errors that break input processing: a bitmask
                            of self::ERROR_* constants. \c false is mapped to self::ERROR_NONE and
                            \c true to the (already normalized) value of $detectErrorLevel.
 \param $detectErrorLevel   Determines the types of errors that are detected and added to the error
                            log ($Messages): a bitmask of self::ERROR_* constants. The deprecated
                            self::SHOW_SCHEMA_ERRORS and self::SHOW_ALL_ERRORS values are mapped to
                            self::ERROR_SCHEMA and self::ERROR_ALL.
 \param $parseLineBreaks    Stored in $ParseLineBreaks; when \c true line breaks of the input are
                            kept by process() and turned into <br/> elements by parseTag().
 \param $removeDefaultAttrs Stored in $RemoveDefaultAttrs (not used by the code visible here).
*/
function __construct( $validateErrorLevel = self::ERROR_NONE, $detectErrorLevel = self::ERROR_NONE, $parseLineBreaks = false,
                      $removeDefaultAttrs = false )
{
    // Back-compatibility fixes: translate the deprecated SHOW_* levels
    if ( $detectErrorLevel === self::SHOW_SCHEMA_ERRORS )
    {
        $detectErrorLevel = self::ERROR_SCHEMA;
    }
    elseif ( $detectErrorLevel === self::SHOW_ALL_ERRORS )
    {
        $detectErrorLevel = self::ERROR_ALL;
    }

    // Back-compatibility fixes: boolean validation levels
    if ( $validateErrorLevel === false )
    {
        $validateErrorLevel = self::ERROR_NONE;
    }
    elseif ( $validateErrorLevel === true )
    {
        $validateErrorLevel = $detectErrorLevel;
    }

    $this->ValidateErrorLevel = $validateErrorLevel;
    $this->DetectErrorLevel = $detectErrorLevel;

    $this->RemoveDefaultAttrs = $removeDefaultAttrs;
    $this->ParseLineBreaks = $parseLineBreaks;

    $this->XMLSchema = eZXMLSchema::instance();

    $this->eZPublishVersion = eZPublishSDK::majorVersion() + eZPublishSDK::minorVersion() * 0.1;

    // Optional switches of ezxml.ini [InputSettings]; each one is stored in the
    // property of the same name and keeps its current value when the setting is absent.
    $ini = eZINI::instance( 'ezxml.ini' );
    foreach ( array( 'TrimSpaces', 'AllowMultipleSpaces', 'AllowNumericEntities' ) as $setting )
    {
        if ( $ini->hasVariable( 'InputSettings', $setting ) )
        {
            $this->$setting = $ini->variable( 'InputSettings', $setting ) == 'true';
        }
    }

    $contentIni = eZINI::instance( 'content.ini' );
    $this->StrictHeaders = $contentIni->variable( 'header', 'UseStrictHeaderRule' ) == 'true';
}

/*!
 \deprecated PHP 4 style constructor. PHP 7 deprecates such constructors and PHP 8 no longer
             treats them as constructors, so the initialization lives in __construct().
             This method is kept so that existing code calling
             $this->eZXMLInputParser( ... ) explicitly keeps working.

 The parameters are the same as for __construct().
*/
function eZXMLInputParser( $validateErrorLevel = self::ERROR_NONE, $detectErrorLevel = self::ERROR_NONE, $parseLineBreaks = false,
                           $removeDefaultAttrs = false )
{
    // self:: (not $this->) so that a subclass defining its own __construct() is not re-entered
    self::__construct( $validateErrorLevel, $detectErrorLevel, $parseLineBreaks, $removeDefaultAttrs );
}
00175 
/*!
 \public
 Sets the name of the class used for the resulting document.

 \param $DOMDocumentClass Class name; createRootNode() and process() instantiate it
                          with the arguments ( '1.0', 'utf-8' ).
*/
function setDOMDocumentClass( $DOMDocumentClass )
{
    $this->DOMDocumentClass = $DOMDocumentClass;
}
00181 
/*!
 \public
 Switches line break parsing on or off.

 \param $value When it evaluates to true, process() keeps the "\n" characters of the input
               and parseTag() turns each of them into a <br/> element; otherwise process()
               strips them.
*/
function setParseLineBreaks( $value )
{
    $this->ParseLineBreaks = $value;
}
00187 
/*!
 \public
 Stores the "remove default attributes" switch in $RemoveDefaultAttrs.

 \param $value New value of the switch (it is read by code outside of this method).
*/
function setRemoveDefaultAttrs( $value )
{
    $this->RemoveDefaultAttrs = $value;
}
00193 
/*!
 \public
 Appends the root <section> element to the document, creating the document
 first if there is none yet, and declares the 'image', 'xhtml' and 'custom'
 namespace prefixes on that element.

 \return The document ($this->Document).
*/
function createRootNode()
{
    if ( !$this->Document )
    {
        $this->Document = new $this->DOMDocumentClass( '1.0', 'utf-8' );
    }
    $document = $this->Document;

    // Root section carrying the namespace definitions
    $rootSection = $document->createElement( 'section' );
    $document->appendChild( $rootSection );

    $declaredPrefixes = array( 'image', 'xhtml', 'custom' );
    foreach ( $declaredPrefixes as $nsPrefix )
    {
        $rootSection->setAttributeNS(
            'http://www.w3.org/2000/xmlns/',
            'xmlns:' . $nsPrefix,
            $this->Namespaces[$nsPrefix]
        );
    }

    return $document;
}
00211 
/*!
 \public
 Call this function to process your input.

 The text is normalized (carriage returns removed, tabs turned into spaces, characters
 that are not allowed in XML replaced by a space, line breaks removed unless
 $ParseLineBreaks is set), then parsed into a new document (pass 1) and finally
 transformed and validated (pass 2).

 \param $text           Input string; it has to be valid UTF-8.
 \param $createRootNode When true the root <section> element is created before parsing.
 \return The resulting document, or \c false when $QuitProcess was raised during one of
         the passes or when the input could not be read as UTF-8. Messages are collected
         in $Messages.
*/
function process( $text, $createRootNode = true )
{
    $text = str_replace( "\r", '', $text );
    $text = str_replace( "\t", ' ', $text );

    // replace unicode chars that will break the XML validity
    // see http://www.w3.org/TR/REC-xml/#charsets
    // NOTE(review): the XML Char production also allows U+10000-U+10FFFF, which this
    // pattern replaces as well - confirm whether that is intentional (e.g. storage limits).
    $count = 0;
    $washedText = preg_replace( '/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u', ' ', $text, -1, $count );
    if ( $washedText === null )
    {
        // preg_replace() returns null on failure; with the /u modifier this happens when
        // the subject is not valid UTF-8. Going on would silently discard the whole input,
        // so report the problem and give up instead.
        $this->Messages[] = ezpI18n::tr(
            'kernel/classes/datatypes/ezxmltext',
            'The input could not be processed: it contains invalid (non UTF-8) data.'
        );
        return false;
    }
    $text = $washedText;

    if ( $count > 0 )
    {
        $this->Messages[] = ezpI18n::tr(
            'kernel/classes/datatypes/ezxmltext',
            "%count invalid character(s) have been found and replaced by a space",
            false,
            array( '%count' => $count )
        );
    }
    if ( !$this->ParseLineBreaks )
    {
        $text = str_replace( "\n", '', $text );
    }

    $this->Document = new $this->DOMDocumentClass( '1.0', 'utf-8' );

    if ( $createRootNode )
    {
        $this->createRootNode();
    }

    // Perform pass 1
    // Parsing the source string
    $this->performPass1( $text );

    $debug = eZDebugSetting::isConditionTrue( 'kernel-datatype-ezxmltext', eZDebug::LEVEL_DEBUG );
    if ( $debug )
    {
        eZDebug::writeDebug( $this->Document->saveXML(), eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext', 'XML after pass 1' ) );
    }

    if ( $this->QuitProcess )
    {
        return false;
    }

    // Perform pass 2
    $this->performPass2();

    if ( $debug )
    {
        eZDebug::writeDebug( $this->Document->saveXML(), eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext', 'XML after pass 2' ) );
    }

    if ( $this->QuitProcess )
    {
        return false;
    }

    return $this->Document;
}
00276 
/*!
 \public
 Pass 1: parses the source string into the DOM tree.

 When the document already has a root element, parseTag() is called repeatedly with
 that element as parent until the whole string is consumed. Without a root element a
 single parseTag() call is made with an empty parent, so the parsed node becomes a
 child of the document itself.

 \param $data Source string (passed on to parseTag() by reference).
 \return \c false if $QuitProcess got raised while parsing, \c true otherwise.
*/
function performPass1( &$data )
{
    $pos = 0;

    if ( !$this->Document->documentElement )
    {
        $noParent = null;
        $this->parseTag( $data, $pos, $noParent );
        return $this->QuitProcess ? false : true;
    }

    while ( true )
    {
        $this->parseTag( $data, $pos, $this->Document->documentElement );
        if ( $this->QuitProcess )
        {
            return false;
        }
        if ( $pos >= strlen( $data ) )
        {
            return true;
        }
    }
}
00312 
/*!
 The main recursive function for pass 1.

 Parses one node starting at offset $pos: a text node, a closing tag, a line break
 (only when $ParseLineBreaks is set) or an opening tag. For an element that may have
 children the function calls itself until the element gets closed.

 \param $data   Source string. It is only read here, but handed to the 'parsingHandler'
                by reference.
 \param $pos    Current offset in $data, moved behind everything that was consumed.
 \param $parent Node the new node is appended to. When it is empty, the new node is
                appended to the document itself.
 \return \c true when the caller has to stop parsing children of its element (end of
         data reached, the element was closed or has to be auto-closed), \c false when
         the caller may go on with the next sibling. Errors are reported through
         handleError(); callers check $QuitProcess.
*/
function parseTag( &$data, &$pos, &$parent )
{
    $dataLength = strlen( $data );
    if ( $pos >= $dataLength )
    {
        return true;
    }

    // Find tag, determine its type, name and attributes.
    $tagBeginPos = strpos( $data, '<', $pos );

    if ( $this->ParseLineBreaks )
    {
        // Regard line break as a start tag position
        $lineBreakPos = strpos( $data, "\n", $pos );
        if ( $lineBreakPos !== false )
        {
            $tagBeginPos = $tagBeginPos === false ? $lineBreakPos : min( $tagBeginPos, $lineBreakPos );
        }
    }

    $tagName = '';
    $attributes = null;
    $thisInputTag = null;

    // If it doesn't begin with '<' (or a line break) then it's a text node.
    if ( $tagBeginPos === false || $tagBeginPos != $pos )
    {
        $tagName = $newTagName = '#text';
        $noChildren = true;

        // No further tag: the text runs up to the end of the data
        if ( !$tagBeginPos )
        {
            $tagBeginPos = $dataLength;
        }

        $textContent = substr( $data, $pos, $tagBeginPos - $pos );
        $textContent = $this->washText( $textContent );

        $pos = $tagBeginPos;
        if ( $textContent === '' )
        {
            return false;
        }
    }
    // Process closing tag.
    elseif ( $data[$tagBeginPos] == '<' && $tagBeginPos + 1 < $dataLength &&
             $data[$tagBeginPos + 1] == '/' )
    {
        $tagEndPos = strpos( $data, '>', $tagBeginPos + 1 );
        if ( $tagEndPos === false )
        {
            $pos = $tagBeginPos + 1;

            $this->handleError( self::ERROR_SYNTAX, ezpI18n::tr( 'kernel/classes/datatypes/ezxmltext', 'Wrong closing tag' ) );
            return false;
        }

        $pos = $tagEndPos + 1;
        $closedTagName = strtolower( trim( substr( $data, $tagBeginPos + 2, $tagEndPos - $tagBeginPos - 2 ) ) );

        // Find matching tag in ParentStack array, innermost open tag first
        $firstLoop = true;
        for ( $i = count( $this->ParentStack ) - 1; $i >= 0; $i-- )
        {
            $parentNames = $this->ParentStack[$i];
            if ( $parentNames[0] == $closedTagName )
            {
                // Always the innermost entry is removed, even when the match was found deeper
                array_pop( $this->ParentStack );
                if ( !$firstLoop )
                {
                    // An outer tag is being closed: close the innermost element only and
                    // rewind, so that the same closing tag is parsed again one level up.
                    $pos = $tagBeginPos;
                    return true;
                }
                // If newTagName was '' we don't break children loop
                elseif ( $parentNames[1] !== '' )
                {
                    return true;
                }
                else
                {
                    return false;
                }
            }
            $firstLoop = false;
        }

        $this->handleError( self::ERROR_SYNTAX, ezpI18n::tr( 'kernel/classes/datatypes/ezxmltext', 'Wrong closing tag : &lt;/%1&gt;.', false, array( $closedTagName ) ) );

        return false;
    }
    // Insert <br/> instead of linebreaks
    elseif ( $this->ParseLineBreaks && $data[$tagBeginPos] == "\n" )
    {
        $newTagName = 'br';
        $noChildren = true;
        $pos = $tagBeginPos + 1;
    }
    // Regular tag: get tag's name and attributes.
    else
    {
        $tagEndPos = strpos( $data, '>', $tagBeginPos );
        if ( $tagEndPos === false )
        {
            $pos = $tagBeginPos + 1;

            $this->handleError( self::ERROR_SYNTAX, ezpI18n::tr( 'kernel/classes/datatypes/ezxmltext', 'Wrong opening tag' ) );
            return false;
        }

        $pos = $tagEndPos + 1;
        $tagString = substr( $data, $tagBeginPos + 1, $tagEndPos - $tagBeginPos - 1 );
        // A final slash marks an empty element ( <tag/> )
        $noChildren = substr( $tagString, -1, 1 ) == '/';
        // Remove final slash and the spaces in front of it
        $tagString = preg_replace( "/\s*\/$/", "", $tagString );

        $firstSpacePos = strpos( $tagString, ' ' );
        if ( $firstSpacePos === false )
        {
            $tagName = strtolower( trim( $tagString ) );
            $attributeString = '';
        }
        else
        {
            $tagName = strtolower( substr( $tagString, 0, $firstSpacePos ) );
            $attributeString = trim( substr( $tagString, $firstSpacePos + 1 ) );
            // Parse attribute string
            if ( $attributeString )
            {
                $attributes = $this->parseAttributes( $attributeString );
            }
        }

        // Determine tag's name
        if ( isset( $this->InputTags[$tagName] ) )
        {
            $thisInputTag = $this->InputTags[$tagName];

            if ( isset( $thisInputTag['name'] ) )
            {
                $newTagName = $thisInputTag['name'];
            }
            else
            {
                $newTagName = $this->callInputHandler( 'nameHandler', $tagName, $attributes );
            }
        }
        else
        {
            if ( $this->XMLSchema->exists( $tagName ) )
            {
                $newTagName = $tagName;
            }
            else
            {
                $this->handleError( self::ERROR_SYNTAX, ezpI18n::tr( 'kernel/classes/datatypes/ezxmltext', 'Unknown tag: &lt;%1&gt;.', false, array( $tagName ) ) );
                return false;
            }
        }

        // Check 'noChildren' property. It is a boolean defaulting to false, so an
        // explicit 'noChildren' => false must not switch children off.
        if ( !empty( $thisInputTag['noChildren'] ) )
        {
            $noChildren = true;
        }

        $thisOutputTag = isset( $this->OutputTags[$newTagName] ) ? $this->OutputTags[$newTagName] : null;

        // Implementation of 'autoCloseOn' rule ( Handling of unclosed tags, ex.: <p>, <li> ).
        // It is only applied when the parent itself has an element as parent.
        if ( isset( $thisOutputTag['autoCloseOn'] ) &&
             $parent &&
             $parent->parentNode instanceof DOMElement &&
             in_array( $parent->nodeName, $thisOutputTag['autoCloseOn'] ) )
        {
            // Wrong nesting: auto-close parent and try to re-parse this tag at higher level
            array_pop( $this->ParentStack );
            $pos = $tagBeginPos;
            return true;
        }

        // Append to parent stack. Tags converted to '' are pushed as well, so that
        // their closing tag finds its match.
        if ( !$noChildren && $newTagName !== false )
        {
            $this->ParentStack[] = array( $tagName, $newTagName, $attributeString );
        }

        if ( !$newTagName )
        {
            // If $newTagName is an empty string then it's not an error
            if ( $newTagName === false )
            {
                $this->handleError( self::ERROR_SYNTAX, ezpI18n::tr( 'kernel/classes/datatypes/ezxmltext', "Can't convert tag's name: &lt;%1&gt;.", false, array( $tagName ) ) );
            }

            return false;
        }

        // wordmatch.ini support
        if ( $attributeString )
        {
            $attributes = $this->wordMatchSupport( $newTagName, $attributes, $attributeString );
        }
    }

    // Create text or normal node.
    if ( $newTagName == '#text' )
    {
        $element = $this->Document->createTextNode( $textContent );
    }
    else
    {
        $element = $this->Document->createElement( $newTagName );
    }

    // NOTE(review): setAttributes() also checks 'requiredInputAttributes'; that check is
    // skipped for tags that come without any attribute - confirm this is intended.
    if ( $attributes )
    {
        $this->setAttributes( $element, $attributes );
    }

    // Append element as a child or set it as root if there is no parent.
    if ( $parent )
    {
        $parent->appendChild( $element );
    }
    else
    {
        $this->Document->appendChild( $element );
    }

    // The parsing handler gets the data, the current position and the tag's start
    // position by reference.
    $params = array();
    $params[] =& $data;
    $params[] =& $pos;
    $params[] =& $tagBeginPos;
    $result = $this->callOutputHandler( 'parsingHandler', $element, $params );

    if ( $result === false )
    {
        // This tag is already parsed in handler
        if ( !$noChildren )
        {
            array_pop( $this->ParentStack );
        }
        return false;
    }

    if ( $this->QuitProcess )
    {
        return false;
    }

    // Process children until one of the calls reports that this element is closed
    if ( !$noChildren )
    {
        do
        {
            $parseResult = $this->parseTag( $data, $pos, $element );
            if ( $this->QuitProcess )
            {
                return false;
            }
        }
        while ( $parseResult !== true );
    }

    return false;
}
00581 
00582     /*
00583         Helper functions for pass 1
00584     */
00585 
/*!
 Splits the attribute part of a tag into an array of attributes.

 Values may be enclosed in double quotes, in single quotes or not quoted at all.
 Attribute names are converted to lower case, values are trimmed. Attributes whose
 value is empty are not returned.

 \param $attributeString Everything that follows the tag name, ex.: href="/a" class='b'
 \return array( attribute name => value ), empty when nothing could be parsed.
*/
function parseAttributes( $attributeString )
{
    // Valid characters for XML attributes
    // @see http://www.w3.org/TR/xml/#NT-Name
    $nameStartChar = ':A-Z_a-z\\xC0-\\xD6\\xD8-\\xF6\\xF8-\\x{2FF}\\x{370}-\\x{37D}\\x{37F}-\\x{1FFF}\\x{200C}-\\x{200D}\\x{2070}-\\x{218F}\\x{2C00}-\\x{2FEF}\\x{3001}-\\x{D7FF}\\x{F900}-\\x{FDCF}\\x{FDF0}-\\x{FFFD}\\x{10000}-\\x{EFFFF}';
    $pattern = "/\s+([$nameStartChar][$nameStartChar\-.0-9\\xB7\\x{0300}-\\x{036F}\\x{203F}-\\x{2040}]*)\s*=\s*(?:(?:\"([^\"]+?)\")|(?:'([^']+?)')|(?: *([^\"'\s]+)\s*))/u";

    // A leading space is prepended because every attribute has to be preceded by whitespace
    $matches = array();
    if ( !preg_match_all( $pattern, " " . $attributeString, $matches, PREG_SET_ORDER ) )
    {
        return array();
    }

    $parsed = array();
    foreach ( $matches as $match )
    {
        // Only one of the three value groups takes part in a match and trailing
        // unmatched groups are left out, so the value is always the last item
        $value = trim( end( $match ) );
        // Value of '0' is valid ( eg. border='0' ), hence no plain truth test
        if ( $value === '' )
        {
            continue;
        }
        $parsed[strtolower( $match[1] )] = $value;
    }

    return $parsed;
}
00614 
/*!
 Sets the parsed input attributes on the element.

 Attribute names are converted according to the 'attributes' map of the element's
 $OutputTags entry, 'class' values are checked against the list of classes returned by
 the XML schema, and names carrying a prefix are created in the matching namespace of
 $Namespaces. Finally the 'requiredInputAttributes' of the element are checked against
 the input attribute names.

 Problems are reported through handleError() (not allowed class, missing required
 attribute) or as a debug warning (unknown namespace prefix).

 \param $element    Element that receives the attributes.
 \param $attributes array( input attribute name => value )
*/
function setAttributes( $element, $attributes )
{
    $nodeName = $element->nodeName;
    // Not every element has an entry in $OutputTags
    $thisOutputTag = isset( $this->OutputTags[$nodeName] ) ? $this->OutputTags[$nodeName] : array();

    foreach ( $attributes as $key => $value )
    {
        // Convert attribute names
        if ( isset( $thisOutputTag['attributes'] ) &&
             isset( $thisOutputTag['attributes'][$key] ) )
        {
            $qualifiedName = $thisOutputTag['attributes'][$key];
        }
        else
        {
            $qualifiedName = $key;
        }

        // Filter classes
        if ( $qualifiedName == 'class' )
        {
            $classesList = $this->XMLSchema->getClassesList( $nodeName );
            if ( !in_array( $value, $classesList ) )
            {
                $this->handleError( self::ERROR_DATA,
                                    ezpI18n::tr( 'kernel/classes/datatypes/ezxmltext', "Class '%1' is not allowed for element &lt;%2&gt; (check content.ini).",
                                    false, array( $value, $nodeName ) ) );
                continue;
            }
        }

        // An empty converted name means that the attribute is dropped
        if ( !$qualifiedName )
        {
            continue;
        }

        // Create attribute nodes
        if ( strpos( $qualifiedName, ':' ) )
        {
            list( $prefix, $name ) = explode( ':', $qualifiedName );
            if ( isset( $this->Namespaces[$prefix] ) )
            {
                $element->setAttributeNS( $this->Namespaces[$prefix], $qualifiedName, $value );
            }
            else
            {
                eZDebug::writeWarning( "No namespace defined for prefix '$prefix'.", 'eZXML input parser' );
            }
        }
        else
        {
            $element->setAttribute( $qualifiedName, $value );
        }
    }

    // Check that the required attributes are present (input names are checked, not converted ones)
    if ( isset( $thisOutputTag['requiredInputAttributes'] ) )
    {
        foreach ( $thisOutputTag['requiredInputAttributes'] as $reqAttrName )
        {
            if ( !array_key_exists( $reqAttrName, $attributes ) )
            {
                $this->handleError( self::ERROR_SCHEMA,
                                    ezpI18n::tr( 'kernel/classes/datatypes/ezxmltext', "Required attribute '%1' is not presented in tag &lt;%2&gt;.",
                                    false, array( $reqAttrName, $nodeName ) ) );
            }
        }
    }
}
00691 
/*!
 Prepares the content of a text node: decodes the basic entities, converts numeric
 entities unless $AllowNumericEntities is set and collapses runs of spaces into a
 single space unless $AllowMultipleSpaces is set.

 \param $textContent Raw text taken from the input.
 \return The washed text.
*/
function washText( $textContent )
{
    $washed = $this->entitiesDecode( $textContent );

    if ( !$this->AllowNumericEntities )
    {
        $washed = $this->convertNumericEntities( $washed );
    }

    return $this->AllowMultipleSpaces
        ? $washed
        : preg_replace( "/ {2,}/", " ", $washed );
}
00708 
/*!
 Replaces the entities &#039; &gt; &lt; &apos; &quot; and &amp; by their characters.

 The replacements are applied one after another in the listed order. &amp; comes
 last, so that an escaped entity like &amp;lt; results in &lt; and is not decoded twice.

 \param $text Text to decode.
 \return The decoded text.
*/
function entitiesDecode( $text )
{
    $entities   = array( '&#039;', '&gt;', '&lt;', '&apos;', '&quot;', '&amp;' );
    $characters = array( "'",      '>',    '<',    "'",      '"',      '&' );

    return str_replace( $entities, $characters, $text );
}
00720 
/*!
 Converts numeric entities ( &#...; ) to characters.

 Everything between "&#" and the next ";" is handed to the 'unicode' text codec as a
 single character code, the rest of the text is copied unchanged. A "&#" that is not
 followed by any ";" is kept literally.

 NOTE(review): how the codec treats codes that are not decimal numbers (ex.: the
 hexadecimal form &#x41;) cannot be told from here - confirm against eZTextCodec.

 \param $text Text that may contain numeric entities.
 \return The converted text.
*/
function convertNumericEntities( $text )
{
    // The shortest form handled here ( "&#" + one character + ";" ) takes 4 bytes
    if ( strlen( $text ) < 4 )
    {
        return $text;
    }

    $length = strlen( $text );
    $codec = null;
    $domString = '';
    $pos = 0;
    while ( $pos < $length )
    {
        $entityPos = strpos( $text, '&#', $pos );
        if ( $entityPos === false )
        {
            // No more entities: copy the remaining text, including its last character
            $domString .= substr( $text, $pos );
            break;
        }

        // Copy the text in front of the entity
        $domString .= substr( $text, $pos, $entityPos - $pos );

        $endPos = strpos( $text, ';', $entityPos + 2 );
        if ( $endPos === false )
        {
            // Unterminated entity: there is no ';' up to the end of the text, so
            // nothing else can be converted. Keep the remainder as it is.
            $domString .= substr( $text, $entityPos );
            break;
        }

        // The codec is needed (and fetched) only if there is something to convert
        if ( $codec === null )
        {
            $codec = eZTextCodec::instance( 'unicode', false );
        }

        $code = substr( $text, $entityPos + 2, $endPos - ( $entityPos + 2 ) );
        $domString .= $codec->convertString( array( $code ) );
        $pos = $endPos + 1;
    }

    return $domString;
}
00764 
/*!
 wordmatch.ini support: for each entry of the 'MatchString' array in the group named
 after the tag, the entry's value is matched as a case insensitive regular expression
 against the raw attribute string. On a match the entry's key becomes the 'class'
 attribute and the 'style' attribute is removed. If several entries match, the last
 one wins.

 \param $newTagName      Output name of the tag, used as ini group name.
 \param $attributes      Attributes parsed so far.
 \param $attributeString Raw attribute string of the tag.
 \return The modified attributes parameter.
*/
protected function wordMatchSupport( $newTagName, $attributes, $attributeString )
{
    $wordMatchIni = eZINI::instance( 'wordmatch.ini' );
    if ( !$wordMatchIni->hasVariable( $newTagName, 'MatchString' ) )
    {
        return $attributes;
    }

    $patterns = $wordMatchIni->variable( $newTagName, 'MatchString' );
    if ( !$patterns )
    {
        return $attributes;
    }

    foreach ( array_keys( $patterns ) as $className )
    {
        $expression = '/' . $patterns[$className] . '/i';
        if ( preg_match( $expression, $attributeString ) )
        {
            $attributes['class'] = $className;
            unset( $attributes['style'] );
        }
    }

    return $attributes;
}
00789 
00790 
/*!
 \public
 Pass 2: Process the tree, run handlers, rebuild and validate.

 Starts processSubtree() at the root element of the document.
*/
function performPass2()
{
    // processSubtree() takes the last handler result by reference; there is none for the root
    $lastHandlerResult = null;
    $rootElement = $this->Document->documentElement;

    $this->processSubtree( $rootElement, $lastHandlerResult );
}
00802 
00803     // main recursive function for pass 2
00804 
00805     function processSubtree( $element, &$lastHandlerResult )
00806     {
00807         $ret = null;
00808         $tmp = null;
00809 
00810         // Call "Init handler"
00811         $this->callOutputHandler( 'initHandler', $element, $tmp );
00812 
00813         $debug = eZDebugSetting::isConditionTrue( 'kernel-datatype-ezxmltext', eZDebug::LEVEL_DEBUG );
00814 
00815         // Process children
00816         if ( $element->hasChildNodes() )
00817         {
00818             // Make another copy of children to save primary structure
00819             $childNodes = $element->childNodes;
00820             $childrenCount = $childNodes->length;
00821 
00822             // we can not loop directly over the childNodes property, because this will change while we are working on it's parent's children
00823             $children = array();
00824             foreach ( $childNodes as $childNode )
00825             {
00826                 $children[] = $childNode;
00827             }
00828 
00829             $lastResult = null;
00830             $newElements = array();
00831             foreach ( $children as $child )
00832             {
00833                 if ( $debug )
00834                 {
00835                     eZDebug::writeDebug( 'processing children, current child: ' . $child->nodeName, eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext', __METHOD__ ) );
00836                 }
00837 
00838                 $childReturn = $this->processSubtree( $child, $lastResult );
00839 
00840                 unset( $lastResult );
00841                 if ( isset( $childReturn['result'] ) )
00842                 {
00843                     if ( $debug )
00844                     {
00845                         eZDebug::writeDebug( 'return result is set for child ' . $child->nodeName, eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext', __METHOD__ ) );
00846                     }
00847 
00848                     $lastResult = $childReturn['result'];
00849                 }
00850 
00851                 if ( isset( $childReturn['new_elements'] ) )
00852                 {
00853                     $newElements = array_merge( $newElements, $childReturn['new_elements'] );
00854                 }
00855 
00856                 if ( $this->QuitProcess )
00857                 {
00858                     return $ret;
00859                 }
00860             }
00861 
00862             if ( $debug )
00863             {
00864                 eZDebug::writeDebug( $this->Document->saveXML(),
00865                                      eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext',
00866                                                                   'XML before processNewElements for element ' . $element->nodeName ) );
00867             }
00868 
00869             // process elements created in children handlers
00870             $this->processNewElements( $newElements );
00871 
00872             if ( $debug )
00873             {
00874                 eZDebug::writeDebug( $this->Document->saveXML(),
00875                                      eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext',
00876                                                                   'XML after processNewElements for element ' . $element->nodeName ) );
00877             }
00878         }
00879 
00880         // Call "Structure handler"
00881         if ( $debug )
00882         {
00883             eZDebug::writeDebug( $this->Document->saveXML(),
00884                                  eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext',
00885                                                               'XML before callOutputHandler structHandler for element ' . $element->nodeName ) );
00886         }
00887 
00888         $ret = $this->callOutputHandler( 'structHandler', $element, $lastHandlerResult );
00889 
00890         if ( $debug )
00891         {
00892             eZDebug::writeDebug( $this->Document->saveXML(),
00893                                  eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext',
00894                                                               'XML after callOutputHandler structHandler for element ' . $element->nodeName ) );
00895             eZDebug::writeDebug( $ret,
00896                                  eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext',
00897                                                               'return value of callOutputHandler structHandler for element ' . $element->nodeName ) );
00898         }
00899 
00900         // Process by schema (check if element is allowed to exist)
00901         if ( !$this->processBySchemaPresence( $element ) )
00902         {
00903             if ( $debug )
00904             {
00905                 eZDebug::writeDebug( $this->Document->saveXML(),
00906                                      eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext',
00907                                                                   'XML after failed processBySchemaPresence for element ' . $element->nodeName ) );
00908             }
00909             return $ret;
00910         }
00911 
00912         if ( $debug )
00913         {
00914             eZDebug::writeDebug( $this->Document->saveXML(),
00915                                  eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext',
00916                                                               'XML after processBySchemaPresence for element ' . $element->nodeName ) );
00917         }
00918 
00919         // Process by schema (check place in the tree)
00920         if ( !$this->processBySchemaTree( $element ) )
00921         {
00922             if ( $debug )
00923             {
00924                 eZDebug::writeDebug( $this->Document->saveXML(),
00925                                      eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext',
00926                                                                   'XML after failed processBySchemaTree for element ' . $element->nodeName ) );
00927             }
00928             return $ret;
00929         }
00930 
00931         if ( $debug )
00932         {
00933             eZDebug::writeDebug( $this->Document->saveXML(),
00934                                  eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext',
00935                                                               'XML after processBySchemaTree for element ' . $element->nodeName ) );
00936         }
00937 
00938 
00939         $tmp = null;
00940         // Call "Publish handler"
00941         $this->callOutputHandler( 'publishHandler', $element, $tmp );
00942 
00943         // Process attributes according to the schema
00944         if ( $element->hasAttributes() )
00945         {
00946             if ( !$this->XMLSchema->hasAttributes( $element ) )
00947             {
00948                 eZXMLInputParser::removeAllAttributes( $element );
00949             }
00950             else
00951             {
00952                 $this->processAttributesBySchema( $element );
00953             }
00954         }
00955         return $ret;
00956     }
00957     /*
00958         Helper functions for pass 2
00959     */
00960 
00961     /*!
00962        Removes all attribute nodes from element node $element
00963     */
00964     function removeAllAttributes( DOMElement $element )
00965     {
00966         $attribs = $element->attributes;
00967         for ( $i = $attribs->length - 1; $i >= 0; $i-- )
00968         {
00969             $element->removeAttributeNode( $attribs->item( $i ) );
00970         }
00971     }
00972 
00973     // Check if the element is allowed to exist in this document and remove it if not.
00974     function processBySchemaPresence( $element )
00975     {
00976         $parent = $element->parentNode;
00977         if ( $parent instanceof DOMElement )
00978         {
00979             // If this is a foreign element, remove it
00980             if ( !$this->XMLSchema->exists( $element ) )
00981             {
00982                 if ( $element->nodeName == 'custom' )
00983                 {
00984                     $this->handleError( self::ERROR_SCHEMA,
00985                                         ezpI18n::tr( 'kernel/classes/datatypes/ezxmltext', "Custom tag '%1' is not allowed.",
00986                                         false, array( $element->getAttribute( 'name' ) ) ) );
00987                 }
00988                 $element = $parent->removeChild( $element );
00989                 return false;
00990             }
00991 
00992             // Delete if children required and no children
00993             // If this is an auto-added element, then do not throw error
00994 
00995             if ( $element->nodeType == XML_ELEMENT_NODE && ( $this->XMLSchema->childrenRequired( $element ) || $element->getAttribute( 'children_required' ) )
00996                  && !$element->hasChildNodes() )
00997             {
00998                 $element = $parent->removeChild( $element );
00999                 if ( !$element->getAttributeNS( 'http://ez.no/namespaces/ezpublish3/temporary/', 'new-element' ) )
01000                 {
01001                     $this->handleError( self::ERROR_SCHEMA, ezpI18n::tr( 'kernel/classes/datatypes/ezxmltext', "&lt;%1&gt; tag can't be empty.",
01002                                         false, array( $element->nodeName ) ) );
01003                     return false;
01004                 }
01005             }
01006         }
01007         // TODO: break processing of any node that doesn't have parent
01008         //       and is not a root node.
01009         elseif ( $element->nodeName != 'section' )
01010         {
01011             return false;
01012         }
01013         return true;
01014     }
01015 
01016     // Check that element has a correct position in the tree and fix it if not.
01017     function processBySchemaTree( $element )
01018     {
01019         $parent = $element->parentNode;
01020 
01021         if ( $parent instanceof DOMElement )
01022         {
01023             $schemaCheckResult = $this->XMLSchema->check( $parent, $element );
01024             if ( !$schemaCheckResult )
01025             {
01026                 if ( $schemaCheckResult === false )
01027                 {
01028                     // Remove indenting spaces
01029                     if ( $element->nodeType == XML_TEXT_NODE && !trim( $element->textContent ) )
01030                     {
01031                         $element = $parent->removeChild( $element );
01032                         return false;
01033                     }
01034 
01035                     $elementName = $element->nodeType == XML_ELEMENT_NODE ? '&lt;' . $element->nodeName . '&gt;' : $element->nodeName;
01036                     $this->handleError( self::ERROR_SCHEMA, ezpI18n::tr( 'kernel/classes/datatypes/ezxmltext', "%1 is not allowed to be a child of &lt;%2&gt;.",
01037                                         false, array( $elementName, $parent->nodeName ) ) );
01038                 }
01039                 $this->fixSubtree( $element, $element );
01040                 return false;
01041             }
01042         }
01043         // TODO: break processing of any node that doesn't have parent
01044         //       and is not a root node.
01045         elseif ( $element->nodeName != 'section' )
01046         {
01047             return false;
01048         }
01049         return true;
01050     }
01051 
01052     // Remove only nodes that don't match schema (recursively)
01053     function fixSubtree( $element, $mainChild )
01054     {
01055         $parent = $element->parentNode;
01056         $mainParent = $mainChild->parentNode;
01057         while ( $element->hasChildNodes() )
01058         {
01059             $child = $element->firstChild;
01060 
01061             $child = $element->removeChild( $child );
01062             $child = $mainParent->insertBefore( $child, $mainChild );
01063 
01064             if ( !$this->XMLSchema->check( $mainParent, $child ) )
01065             {
01066                 $this->fixSubtree( $child, $mainChild );
01067             }
01068         }
01069         $parent->removeChild( $element );
01070     }
01071 
01072     function processAttributesBySchema( $element )
01073     {
01074         // Remove attributes that don't match schema
01075         $schemaAttributes = $this->XMLSchema->attributes( $element );
01076         $schemaCustomAttributes = $this->XMLSchema->customAttributes( $element );
01077 
01078         $attributes = $element->attributes;
01079 
01080         for ( $i = $attributes->length - 1; $i >=0; $i-- )
01081         {
01082             $attr = $attributes->item( $i );
01083             if ( $attr->prefix == 'tmp' )
01084             {
01085                 $element->removeAttributeNode( $attr );
01086                 continue;
01087             }
01088 
01089             $allowed = false;
01090             $removeAttr = false;
01091 
01092             $fullName = $attr->prefix ? $attr->prefix . ':' . $attr->localName : $attr->nodeName;
01093 
01094             // check for allowed custom attributes (3.9)
01095             if ( $attr->prefix == 'custom' && in_array( $attr->localName, $schemaCustomAttributes ) )
01096             {
01097                 $allowed = true;
01098             }
01099             else
01100             {
01101                 if ( in_array( $fullName, $schemaAttributes ) )
01102                 {
01103                    $allowed = true;
01104                 }
01105                 elseif ( in_array( $fullName, $schemaCustomAttributes ) )
01106                 {
01107                     // add 'custom' prefix if it is not given
01108                     $allowed = true;
01109                     $removeAttr = true;
01110                     $element->setAttributeNS( $this->Namespaces['custom'], 'custom:' . $fullName, $attr->value );
01111                 }
01112             }
01113 
01114             if ( !$allowed )
01115             {
01116                 $removeAttr = true;
01117                 $this->handleError( self::ERROR_SCHEMA,
01118                                     ezpI18n::tr( 'kernel/classes/datatypes/ezxmltext', "Attribute '%1' is not allowed in &lt;%2&gt; element.",
01119                                     false, array( $fullName, $element->nodeName ) ) );
01120             }
01121             elseif ( $this->RemoveDefaultAttrs )
01122             {
01123                 // Remove attributes having default values
01124                 $default = $this->XMLSchema->attrDefaultValue( $element->nodeName, $fullName );
01125                 if ( $attr->value == $default )
01126                 {
01127                     $removeAttr = true;
01128                 }
01129             }
01130 
01131             if ( $removeAttr )
01132             {
01133                 $element->removeAttributeNode( $attr );
01134             }
01135         }
01136     }
01137 
01138     function callInputHandler( $handlerName, $tagName, &$attributes )
01139     {
01140         $result = null;
01141         $thisInputTag = $this->InputTags[$tagName];
01142         if ( isset( $thisInputTag[$handlerName] ) )
01143         {
01144             if ( is_callable( array( $this, $thisInputTag[$handlerName] ) ) )
01145             {
01146                 $result = call_user_func_array( array( $this, $thisInputTag[$handlerName] ),
01147                                                 array( $tagName, &$attributes ) );
01148             }
01149             else
01150             {
01151                 eZDebug::writeWarning( "'$handlerName' input handler for tag <$tagName> doesn't exist: '" . $thisInputTag[$handlerName] . "'.", 'eZXML input parser' );
01152             }
01153         }
01154         return $result;
01155     }
01156 
01157     function callOutputHandler( $handlerName, $element, &$params )
01158     {
01159         $result = null;
01160         $thisOutputTag = $this->OutputTags[$element->nodeName];
01161         if ( isset( $thisOutputTag[$handlerName] ) )
01162         {
01163             if ( is_callable( array( $this, $thisOutputTag[$handlerName] ) ) )
01164             {
01165                 $result = call_user_func_array( array( $this, $thisOutputTag[$handlerName] ),
01166                                                 array( $element, &$params ) );
01167             }
01168             else
01169             {
01170                 eZDebug::writeWarning( "'$handlerName' output handler for tag <$element->nodeName> doesn't exist: '" . $thisOutputTag[$handlerName] . "'.", 'eZXML input parser' );
01171             }
01172         }
01173 
01174         return $result;
01175     }
01176 
01177     // Creates new element and adds it to array for further post-processing.
01178     // Use this function if you need to process newly created element (check it by schema
01179     // and call 'structure' and 'publish' handlers)
01180     function createAndPublishElement( $elementName, &$ret )
01181     {
01182         $element = $this->Document->createElement( $elementName );
01183         $element->setAttributeNS( 'http://ez.no/namespaces/ezpublish3/temporary/', 'tmp:new-element', 'true' );
01184 
01185         if ( !isset( $ret['new_elements'] ) )
01186         {
01187             $ret['new_elements'] = array();
01188         }
01189 
01190         $ret['new_elements'][] = $element;
01191         return $element;
01192     }
01193 
01194     function processNewElements( $createdElements )
01195     {
01196         $debug = eZDebugSetting::isConditionTrue( 'kernel-datatype-ezxmltext', eZDebug::LEVEL_DEBUG );
01197         // Call handlers for newly created elements
01198         foreach ( $createdElements as $element )
01199         {
01200             if ( $debug )
01201             {
01202                 eZDebug::writeDebug( 'processing new element ' . $element->nodeName, eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext' ) );
01203             }
01204 
01205             $tmp = null;
01206             if ( !$this->processBySchemaPresence( $element ) )
01207             {
01208                 if ( $debug )
01209                 {
01210                     eZDebug::writeDebug( $this->Document->saveXML(),
01211                                          eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext',
01212                                                                       'xml string after failed processBySchemaPresence for new element ' . $element->nodeName ) );
01213                 }
01214                 continue;
01215             }
01216 
01217             if ( $debug )
01218             {
01219                 eZDebug::writeDebug( $this->Document->saveXML(),
01220                                      eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext',
01221                                                                   'xml string after processBySchemaPresence for new element ' . $element->nodeName ) );
01222             }
01223 
01224 
01225             // Call "Structure handler"
01226             $this->callOutputHandler( 'structHandler', $element, $tmp );
01227 
01228             if ( !$this->processBySchemaTree( $element ) )
01229             {
01230                 if ( $debug )
01231                 {
01232                     eZDebug::writeDebug( $this->Document->saveXML(),
01233                                          eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext',
01234                                                                       'xml string after failed processBySchemaTree for new element ' . $element->nodeName ) );
01235                 }
01236                 continue;
01237             }
01238 
01239             if ( $debug )
01240             {
01241                 eZDebug::writeDebug( $this->Document->saveXML(),
01242                                      eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext',
01243                                                                   'xml string after processBySchemaTree for new element ' . $element->nodeName ) );
01244             }
01245 
01246 
01247             $tmp2 = null;
01248             // Call "Publish handler"
01249             $this->callOutputHandler( 'publishHandler', $element, $tmp2 );
01250 
01251             if ( $debug )
01252             {
01253                 eZDebug::writeDebug( $this->Document->saveXML(),
01254                                      eZDebugSetting::changeLabel( 'kernel-datatype-ezxmltext',
01255                                                                   'xml string after callOutputHandler publishHandler for new element ' . $element->nodeName ) );
01256             }
01257 
01258             // Process attributes according to the schema
01259             if( $element->hasAttributes() )
01260             {
01261                 if ( !$this->XMLSchema->hasAttributes( $element ) )
01262                 {
01263                     eZXMLInputParser::removeAllAttributes( $element );
01264                 }
01265                 else
01266                 {
01267                     $this->processAttributesBySchema( $element );
01268                 }
01269             }
01270         }
01271     }
01272 
01273     /// \public
01274     function getMessages()
01275     {
01276         return $this->Messages;
01277     }
01278 
01279     /// \public
01280     function isValid()
01281     {
01282         return $this->IsInputValid;
01283     }
01284 
01285     function handleError( $type, $message )
01286     {
01287         if ( $type & $this->DetectErrorLevel )
01288         {
01289             $this->IsInputValid = false;
01290             if ( $message )
01291             {
01292                 $this->Messages[] = $message;
01293             }
01294         }
01295 
01296         if ( $type & $this->ValidateErrorLevel )
01297         {
01298             $this->IsInputValid = false;
01299             $this->QuitProcess = true;
01300         }
01301     }
01302 
01303     public $DOMDocumentClass = 'DOMDocument';
01304 
01305     public $XMLSchema;
01306     public $Document = null;
01307     public $Messages = array();
01308     public $eZPublishVersion;
01309 
01310     public $ParentStack = array();
01311 
01312     public $ValidateErrorLevel;
01313     public $DetectErrorLevel;
01314 
01315     public $IsInputValid = true;
01316     public $QuitProcess = false;
01317 
01318     // options that depend on settings
01319     public $TrimSpaces = true;
01320     public $AllowMultipleSpaces = false;
01321     public $AllowNumericEntities = false;
01322     public $StrictHeaders = false;
01323 
01324     // options that depend on parameters passed
01325     public $ParseLineBreaks = false;
01326     public $RemoveDefaultAttrs = false;
01327 }
01328 ?>