eZ Publish  [4.0]
ezchartransform.php
Go to the documentation of this file.
00001 <?php
00002 //
00003 // Definition of eZCharTransform class
00004 //
00005 // Created on: <16-Jul-2004 15:54:21 amos>
00006 //
00007 // ## BEGIN COPYRIGHT, LICENSE AND WARRANTY NOTICE ##
00008 // SOFTWARE NAME: eZ Publish
00009 // SOFTWARE RELEASE: 4.0.x
00010 // COPYRIGHT NOTICE: Copyright (C) 1999-2008 eZ Systems AS
00011 // SOFTWARE LICENSE: GNU General Public License v2.0
00012 // NOTICE: >
00013 //   This program is free software; you can redistribute it and/or
00014 //   modify it under the terms of version 2.0  of the GNU General
00015 //   Public License as published by the Free Software Foundation.
00016 //
00017 //   This program is distributed in the hope that it will be useful,
00018 //   but WITHOUT ANY WARRANTY; without even the implied warranty of
00019 //   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00020 //   GNU General Public License for more details.
00021 //
00022 //   You should have received a copy of version 2.0 of the GNU General
00023 //   Public License along with this program; if not, write to the Free
00024 //   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00025 //   MA 02110-1301, USA.
00026 //
00027 //
00028 // ## END COPYRIGHT, LICENSE AND WARRANTY NOTICE ##
00029 //
00030 
00031 /*! \file ezchartransform.php
00032 */
00033 
00034 /*!
00035   \class eZCharTransform ezchartransform.php
00036   \ingroup eZI18N
00037   \brief Performs rule based transformation of characters in a string
00038 
00039   \sa eZCodeMapper
00040 */
00041 
00042 //include_once( 'lib/ezi18n/classes/eztextcodec.php' );
00043 //include_once( 'lib/ezi18n/classes/ezcharsetinfo.php' );
00044 
class eZCharTransform
{
    /// The timestamp for when the format of the cache files was
    /// last changed. This must be updated when the format changes
    /// to invalidate existing cache files.
    /// 1101288452
    /// 30. Jan. 2007 - 1170165730
    /// 24. Apr. 2007 - 1177423380
    const CODE_DATE = 1177423380;

    /// Parsed commands per transformation group, indexed by group name.
    /// Filled lazily by groupCommands() so transform.ini is only parsed once per group.
    public $GroupRules = array();

    /*!
     PHP 4 style constructor, kept for backwards compatibility.
     It intentionally does nothing.
    */
    function eZCharTransform()
    {
    }

    /*!
     Transforms the text according to the rules defined in \a $rule using character set \a $charset.
     \param $text The text string to be converted, currently Unicode arrays are not supported
     \param $rule Which transformation rule to use, can either be a string identifier or an array with identifiers.
     \param $charset Which charset to use when transforming, if \c false it will use current charset (i18n.ini).
     \param $useCache If \c true then it will use cache files for the mapping,
                      if not it will have to calculate them each time.
     \return The transformed text, an empty \a $text is returned untouched.
    */
    function transform( $text, $rule, $charset = false, $useCache = true )
    {
        if ( $text === '' )
        {
            return $text;
        }

        // The charset name must be resolved before anything else: it is part of
        // the cache file name and is also required by the mapper when the cache
        // is disabled or the cache file is missing/outdated.
        $charsetName = ( $charset === false ? eZTextCodec::internalCharset() : eZCharsetInfo::realCharsetCode( $charset ) );

        $filepath = false;
        if ( $useCache )
        {
            // CRC32 is used for speed, MD5 would be more unique but is slower
            $ruleText = is_array( $rule ) ? implode( ',', $rule ) : $rule;
            $key = eZSys::ezcrc32( 'Rule: ' . $ruleText . '-' . $charset );
            $filepath = $this->cacheFilePath( 'rule-',
                                              '-' . $charsetName,
                                              $key );

            // Try to execute the code in the cache file, if it succeeds
            // we get the transformed text back and are done.
            $retText = $this->executeCacheFile( $text, $filepath );
            if ( $retText !== false )
            {
                return $retText;
            }
        }

        // No usable cache, build the mapping table from the rules
        $mapper = new eZCodeMapper();
        $mapper->loadTransformationFiles( $charsetName, false );
        $transformationData = $this->createTransformationData( $mapper, $rule, $charset );

        if ( $useCache )
        {
            // Plain rules have no custom command code, only the table is stored
            $this->storeCacheFile( $filepath, $transformationData,
                                   '',
                                   'Rule', $charsetName );
        }

        // Execute transformations
        return strtr( $text, $transformationData['table'] );
    }

    /*!
     Transforms the text according to the commands defined for the group \a $group
     in \c transform.ini, using character set \a $charset.
     \param $text The text string to be converted, currently Unicode arrays are not supported
     \param $group Name of the transformation group whose commands should be applied.
     \param $charset Which charset to use when transforming, if \c false it will use current charset (i18n.ini).
     \param $useCache If \c true then it will use cache files for the tables,
                      if not it will have to calculate them each time.
     \return The transformed text or \c false if the commands for \a $group could not be found.
    */
    function transformByGroup( $text, $group, $charset = false, $useCache = true )
    {
        if ( $text === '' )
        {
            return $text;
        }

        $charsetName = ( $charset === false ? eZTextCodec::internalCharset() : eZCharsetInfo::realCharsetCode( $charset ) );

        $filepath = false;
        if ( $useCache )
        {
            // CRC32 is used for speed, MD5 would be more unique but is slower
            $keyText = 'Group:' . $group;
            $key = eZSys::ezcrc32( $keyText . '-' . $charset );
            $filepath = $this->cacheFilePath( 'g-' . $group . '-',
                                              '-' . $charsetName,
                                              $key );

            // Try to execute the code in the cache file, if it succeeds
            // we get the transformed text back and are done.
            $retText = $this->executeCacheFile( $text, $filepath );
            if ( $retText !== false )
            {
                return $retText;
            }
        }

        $commands = $this->groupCommands( $group );
        if ( $commands === false )
            return false;

        $mapper = new eZCodeMapper();
        $mapper->loadTransformationFiles( $charsetName, $group );

        // Collect the rules of all commands in the group
        $rules = array();
        foreach ( $commands as $command )
        {
            $rules = array_merge( $rules,
                                  $mapper->decodeCommand( $command['command'], $command['parameters'] ) );
        }

        $transformationData = $this->createTransformationData( $mapper, $rules, $charset );

        if ( $useCache )
        {
            // Commands may come with custom PHP code, it is appended to the
            // cache file after the table lookup.
            $extraCode = '';
            foreach ( $commands as $command )
            {
                $code = $mapper->generateCommandCode( $command, $charsetName );
                if ( $code !== false )
                {
                    $extraCode .= $code . "\n";
                }
            }
            $this->storeCacheFile( $filepath, $transformationData,
                                   $extraCode,
                                   'Group:' . $group, $charsetName );
        }

        // Execute transformations
        $text = strtr( $text, $transformationData['table'] );

        // Execute custom code; $text is handed to the mapper for each command
        // and the resulting $text is what gets returned.
        foreach ( $commands as $command )
        {
            $mapper->executeCommandCode( $text, $command, $charsetName );
        }

        return $text;
    }

    /*!
     \private
     Builds the transformation data for \a $rules using the mapper \a $mapper.

     A Unicode based mapping table is generated first (entry 0 is dropped),
     then it is turned into a table for \a $charset by the mapper.

     \return An array with the charset mapping table in the entry \c table.
    */
    protected function createTransformationData( $mapper, $rules, $charset )
    {
        // First generate a unicode based mapping table from the rules
        $unicodeTable = $mapper->generateMappingCode( $rules );
        unset( $unicodeTable[0] );

        // Then transform that to a table that works with the requested charset
        return array( 'table' => $mapper->generateCharsetMappingTable( $unicodeTable, $charset ) );
    }

    /*!
     \private
     \return the path of the cached transformation tables.

     The path is remembered in the global variable \c eZCodeMapperCachePath.
    */
    function cachedTransformationPath()
    {
        if ( isset( $GLOBALS['eZCodeMapperCachePath'] ) )
            return $GLOBALS['eZCodeMapperCachePath'];

        $sys = eZSys::instance();
        $path = $sys->cacheDirectory() . '/trans';
        $GLOBALS['eZCodeMapperCachePath'] = $path;
        return $path;
    }

    /*!
     \private
     Finds all commands defined for group \a $group.
     The groups and their commands are defined in \c transform.ini.
     Successful lookups are remembered in \c $GroupRules.

     \return An array with commands or \c false if the group is not active
             or not defined. Each entry consists of:
             - command - Name of the command
             - parameters - Array with parameters for command
    */
    function groupCommands( $group )
    {
        if ( isset( $this->GroupRules[$group] ) )
            return $this->GroupRules[$group];

        $ini = eZINI::instance( 'transform.ini' );
        $groups = $ini->variable( 'Transformation', 'Groups' );
        if ( !in_array( $group, $groups ) )
        {
            eZDebug::writeError( "Transformation group $group is not part of the active group list Groups in transform.ini",
                                 'eZCharTransform::groupCommands' );
            return false;
        }

        if ( !$ini->hasGroup( $group ) )
        {
            eZDebug::writeError( "Transformation group $group is missing in transform.ini",
                                 'eZCharTransform::groupCommands' );
            return false;
        }

        $commands = array();
        foreach ( $ini->variable( $group, 'Commands' ) as $ruleText )
        {
            // A command looks like "name" or "name(param1,param2)",
            // entries which do not match are skipped.
            if ( !preg_match( "#^([a-zA-Z][a-zA-Z0-9_-]+)(\((.+)\))?$#", $ruleText, $matches ) )
                continue;

            $commands[] = array( 'command' => $matches[1],
                                 'parameters' => isset( $matches[3] ) ? explode( ',', $matches[3] ) : array() );
        }

        $this->GroupRules[$group] = $commands;
        return $commands;
    }

    /*!
     Get cache file path, the cache directory is created if it does not exist.

     \param $prefix Text placed in front of the key in the file name
     \param $suffix Text placed after the key in the file name
     \param $key The cache key, it is formatted as an unsigned integer

     \return cache file path.
    */
    function cacheFilePath( $prefix, $suffix, $key )
    {
        $path = eZCharTransform::cachedTransformationPath();
        if ( !file_exists( $path ) )
        {
            eZDir::mkdir( $path, false, true );
        }
        // ctt=charset transform table
        return $path . '/' . $prefix . sprintf( "%u", $key ) . $suffix . '.ctt.php';
    }

    /*!
     \private
     Transforms \a $text by including the cache file \a $filepath, if the file is usable.

     The cache file is rejected when it does not exist, when it is older than the
     cache file of \c transform.ini or when it is older than the newest of
     CODE_DATE and \a $timestamp.

     \param $text The text that should be transformed
     \param $filepath The path of the cache file, see cacheFilePath().
     \param $timestamp A timestamp value which is matched against the cache file,
                       pass for instance the timestamp of the INI file.
     \return The transformed text or \c false if there is no usable cache file.
    */
    protected function executeCacheFile( $text, $filepath, $timestamp = false )
    {
        if ( !file_exists( $filepath ) )
        {
            return false;
        }

        $time = filemtime( $filepath );
        $ini = eZINI::instance( 'transform.ini' );
        if ( $ini->CacheFile && file_exists( $ini->CacheFile ) && $time < filemtime( $ini->CacheFile ) )
        {
            return false;
        }

        if ( $time < max( self::CODE_DATE, $timestamp ) )
        {
            return false;
        }

        // The included PHP code works on the local variable $text
        include $filepath;
        return $text;
    }

    /*!
     \private
     Stores the transformation data \a $transformationData in the cache file \a $filepath.

     The generated file applies the table to \c $text with strtr() and then
     runs \a $extraCode, if any.

     \param $type Description of what was cached, written as a comment in the file
     \param $charsetName Name of the charset, written as a comment in the file
     \return The result of eZPHPCreator::store().
    */
    function storeCacheFile( $filepath, $transformationData,$extraCode, $type, $charsetName )
    {
        $php = new eZPHPCreator( dirname( $filepath ), basename( $filepath ) );

        $php->addComment( "Cached transformation data" );
        $php->addComment( "Type: $type" );
        $php->addComment( "Charset: $charsetName" );
        $php->addComment( "Cached transformation data" );

        $php->addCodePiece( '$data = ' . eZCharTransform::varExport( $transformationData ) . ";\n" );
        $php->addCodePiece( "\$text = strtr( \$text, \$data['table'] );\n" );

        if ( $extraCode )
        {
            $php->addCodePiece( $extraCode );
        }

        return $php->store( true );
    }

    /*!
     \private
     \static
     Creates a text representation of the value \a $value which can
     be placed in files and be read back by a PHP parser as it was.
     This is a thin wrapper around the PHP function var_export().

     \sa varExportInternal
    */
    static function varExport( $value )
    {
        return var_export( $value, true );
    }

    /*!
     \private
     \static
     Creates a text representation of the value \a $value which can
     be placed in files and be read back by a PHP parser as it was.
     Meant as a replacement for PHP versions with broken var_export.

     The type of the values determines the output, it can be one of the following.
     - boolean, becomes \c true or \c false
     - null, becomes \c null
     - string, adds \ (backslash) to backslashes, double quotes, dollar signs and newlines.
               Then wraps the whole string in " (double quotes).
     - numeric, the value is returned as-is.
     - object, creates a representation of an object creation if the object has \c serializeData implemented,
               if not the result is an empty string.
     - array, expands all values recursively using this function
     - anything else becomes \c null

     \param $column Determines the starting column in which the text will be placed.
                    This is used for expanding arrays and objects which can span multiple lines.
     \param $iteration The current iteration, starts at 0 and increases with 1 for each recursive call
    */
    static function varExportInternal( $value, $column = 0, $iteration = 0 )
    {
        if ( is_bool( $value ) )
        {
            return $value ? 'true' : 'false';
        }

        if ( is_null( $value ) )
        {
            return 'null';
        }

        if ( is_string( $value ) )
        {
            $valueText = str_replace( array( "\\", "\"", "\$", "\n" ),
                                      array( "\\\\", "\\\"", "\\$", "\\n" ),
                                      $value );
            return "\"$valueText\"";
        }

        if ( is_numeric( $value ) )
        {
            return $value;
        }

        if ( is_object( $value ) )
        {
            if ( !method_exists( $value, 'serializedata' ) )
            {
                return '';
            }

            $serializeData = $value->serializeData();
            $className = $serializeData['class_name'];
            $text = "new $className(";

            $column += strlen( $text );
            $parameters = $serializeData['parameters'];
            $variables = $serializeData['variables'];

            $count = 0;
            foreach ( $parameters as $parameter )
            {
                if ( $count > 0 )
                {
                    // Following parameters go on their own line, aligned with the first
                    $text .= ",\n" . str_repeat( ' ', $column );
                }
                $variableName = $variables[$parameter];
                $keyText = " ";
                $text .= $keyText . eZCharTransform::varExportInternal( $value->$variableName,
                                                                         $column + strlen( $keyText ),
                                                                         $iteration + 1 );
                ++$count;
            }
            if ( $count > 0 )
                $text .= ' ';

            return $text . ')';
        }

        if ( is_array( $value ) )
        {
            $text = 'array(';
            $column += strlen( $text );
            $valueKeys = array_keys( $value );

            // Keys are only left out when they are exactly 0, 1, 2, ... in that order
            $isIndexed = ( $valueKeys === array_keys( $valueKeys ) );

            $count = 0;
            foreach ( $valueKeys as $key )
            {
                if ( $count > 0 )
                {
                    // Following elements go on their own line, aligned with the first
                    $text .= ",\n" . str_repeat( ' ', $column );
                }
                $keyText = ' ';
                if ( !$isIndexed )
                {
                    if ( is_int( $key ) )
                    {
                        $keyText = $key;
                    }
                    else
                    {
                        $keyText = "\"" . str_replace( array( "\\", "\"", "\n" ),
                                                       array( "\\\\", "\\\"", "\\n" ),
                                                       $key ) . "\"";
                    }
                    $keyText = " $keyText => ";
                }
                $text .= $keyText . eZCharTransform::varExportInternal( $value[$key],
                                                                         $column + strlen( $keyText ),
                                                                         $iteration + 1 );
                ++$count;
            }
            if ( $count > 0 )
                $text .= ' ';

            return $text . ')';
        }

        return 'null';
    }

    /*!
     \static
     Returns the current word separator, if none is found it will read from site.ini URLTranslator/WordSeparator
     The INI value can be \c dash, \c underscore or \c space, any other value gives a dash.
     The separator is remembered in the global variable \c eZCharTransform_wordSeparator.
     \sa setWordSeparator
     */
    static function wordSeparator()
    {
        if ( isset( $GLOBALS['eZCharTransform_wordSeparator'] ) )
        {
            return $GLOBALS['eZCharTransform_wordSeparator'];
        }

        $ini = eZINI::instance();
        switch ( strtolower( $ini->variable( "URLTranslator", "WordSeparator" ) ) )
        {
            case 'underscore':
                $separator = '_';
                break;
            case 'space':
                $separator = ' ';
                break;
            case 'dash':
            default:
                // Unknown values fall back to the dash, the fallback is
                // remembered too so the INI value is only examined once.
                $separator = '-';
                break;
        }

        $GLOBALS['eZCharTransform_wordSeparator'] = $separator;
        return $separator;
    }

    /*!
     Sets the current word separator, set it to \c null to use default value.
     \sa wordSeparator
     */
    function setWordSeparator( $char )
    {
        $GLOBALS['eZCharTransform_wordSeparator'] = $char;
    }

    /*!
     \static
     Cleans up \a $text using the old style of url alias, the text is turned
     into lowercase and every run of characters other than a-z and 0-9 becomes
     one underscore. Underscores at the beginning and end are removed.

     \param $charsetName Not used by this command.
     \return The cleaned up text.
    */
    static function commandUrlCleanupCompat( $text, $charsetName )
    {
        $text = strtolower( $text );
        return preg_replace( array( "#[^a-z0-9]+#",
                                    "#^_+|_+$#" ),
                             array( "_",
                                    "" ),
                             $text );
    }

    /*!
     \static
     Cleans up \a $text for use in an url alias, every run of characters other
     than a-z, A-Z, 0-9, underscore, exclamation mark, dot and dash becomes the
     current word separator.

     \param $charsetName Not used by this command.
     \return The cleaned up text.
     \sa wordSeparator
    */
    static function commandUrlCleanup( $text, $charsetName )
    {
        $sep  = eZCharTransform::wordSeparator();
        // Quote for the delimiter used below too, the separator can be set freely
        $sepQ = preg_quote( $sep, '#' );
        return preg_replace( array( "#[^a-zA-Z0-9_!.-]+#",
                                    "#^[.]+|[!.]+$#", // Remove dots at beginning/end
                                    "#\.\.+#", // Remove double dots
                                    "#[{$sepQ}]+#", // Turn multiple separators into one
                                    "#^[{$sepQ}]+|[{$sepQ}]+$#" ), // Strip separator from beginning/end
                             array( $sep,
                                    $sep,
                                    $sep,
                                    $sep,
                                    "" ),
                             $text );
    }

    /*!
     \static
     Cleans up \a $text for use in an url alias with IRI support.

     With IRI support we keep all characters except some reserved ones, they are
     space, backslash, percent, hash, ampersand, semi-colon, forward slash, colon,
     equal sign, question mark, square brackets, parenthesis and plus.

     \note Space is turned into the word separator to make it easier for people to
           paste urls from the system and have the whole url recognized
           instead of being broken off

     \param $charsetName Not used by this command.
     \return The cleaned up text.
     \sa wordSeparator
    */
    static function commandUrlCleanupIRI( $text, $charsetName )
    {
        $sep  = eZCharTransform::wordSeparator();
        // Quote for the delimiter used below too, the separator can be set freely
        $sepQ = preg_quote( $sep, '#' );

        // Characters stripped from the beginning/end: space, dot, the separator
        // and always the dash. The dash goes last in the character class.
        $prepost = " ." . $sepQ;
        if ( $sep != "-" )
            $prepost .= "-";

        return preg_replace( array( "#[ \\\\%\#&;/:=?\[\]()+]+#",
                                    "#^[.]+|[!.]+$#", // Remove dots at beginning/end
                                    "#\.\.+#", // Remove double dots
                                    "#[{$sepQ}]+#", // Turn multiple separators into one
                                    "#^[{$prepost}]+|[{$prepost}]+$#" ),
                             array( $sep,
                                    $sep,
                                    $sep,
                                    $sep,
                                    "" ),
                             $text );
    }

    /*!
     \static
     \return The unique instance of the character transformer, it is
             kept in the global variable \c eZCharTransformInstance.
    */
    static function instance()
    {
        if ( !isset( $GLOBALS['eZCharTransformInstance'] ) )
        {
            $GLOBALS['eZCharTransformInstance'] = new eZCharTransform();
        }
        return $GLOBALS['eZCharTransformInstance'];
    }
}
00602 
00603 ?>