eZ Publish  [trunk]
ezchartransform.php
Go to the documentation of this file.
00001 <?php
00002 /**
00003  * File containing the eZCharTransform class.
00004  *
00005  * @copyright Copyright (C) 1999-2012 eZ Systems AS. All rights reserved.
00006  * @license http://www.gnu.org/licenses/gpl-2.0.txt GNU General Public License v2
00007  * @version //autogentag//
00008  * @package lib
00009  */
00010 
00011 /*!
00012   \class eZCharTransform ezchartransform.php
00013   \ingroup eZI18N
00014   \brief Performs rule based transformation of characters in a string
00015 
00016   \sa eZCodeMapper
00017 */
00018 
class eZCharTransform
{
    /// The timestamp for when the format of the cache files was
    /// last changed. This must be updated when the format changes
    /// to invalidate existing cache files.
    /// 1101288452
    /// 30. Jan. 2007 - 1170165730
    /// 24. Apr. 2007 - 1177423380
    const CODE_DATE = 1177423380;

    /// Per-instance cache of the parsed commands of each transformation group,
    /// indexed by group name. Filled lazily by groupCommands().
    public $GroupRules = array();

    /*!
     Constructor
    */
    public function __construct()
    {
    }

    /*!
     Legacy PHP4-style constructor name, kept so that code calling it explicitly
     (for instance from a subclass) keeps working. It does nothing.
    */
    public function eZCharTransform()
    {
    }

    /*!
     Transforms the text according to the rules defined in \a $rule using character set \a $charset.
     \param $text The text string to be converted, currently Unicode arrays are not supported
     \param $rule Which transformation rule to use, can either be a string identifier or an array with identifiers.
     \param $charset Which charset to use when transforming, if \c false it will use current charset (i18n.ini).
     \param $useCache If \c true then it will use cache files for the mapping,
                      if not it will have to calculate them each time.
     \return The transformed text. An empty string is returned unchanged.
    */
    public function transform( $text, $rule, $charset = false, $useCache = true )
    {
        if ( $text === '' )
        {
            return $text;
        }

        // The charset name must be resolved before the cache branch: it is part of
        // the cache file name and is also needed when the cache is bypassed.
        $charsetName = ( $charset === false ? eZTextCodec::internalCharset() : eZCharsetInfo::realCharsetCode( $charset ) );

        $filepath = false;
        if ( $useCache )
        {
            // CRC32 is used for speed, MD5 would be more unique but is slower
            $key = eZSys::ezcrc32( 'Rule: ' . ( is_array( $rule ) ? implode( ',', $rule ) : $rule ) . '-' . $charset );
            $filepath = $this->cacheFilePath( 'rule-',
                                              '-' . $charsetName,
                                              $key );

            // Try to execute the code in the cache file, if it succeeds
            // the returned value is the transformed text
            $retText = $this->executeCacheFile( $text, $filepath );
            if ( $retText !== false )
            {
                return $retText;
            }
        }

        // No usable cache, build the mapping table from the rules
        $mapper = new eZCodeMapper();

        $mapper->loadTransformationFiles( $charsetName, false );

        // First generate a unicode based mapping table from the rules
        $unicodeTable = $mapper->generateMappingCode( $rule );
        unset( $unicodeTable[0] );
        // Then transform that to a table that works with the current charset
        // Any character not available in the current charset will be removed
        $charsetTable = $mapper->generateCharsetMappingTable( $unicodeTable, $charset );
        $transformationData = array( 'table' => $charsetTable );
        unset( $unicodeTable );

        if ( $useCache )
        {
            $extraCode = '';
            $this->storeCacheFile( $filepath, $transformationData,
                                   $extraCode,
                                   'Rule', $charsetName );
        }

        // Execute transformations
        return strtr( $text, $transformationData['table'] );
    }

    /*!
     Transforms the text according to the commands of the transformation group \a $group
     using character set \a $charset.
     \param $text The text string to be converted, currently Unicode arrays are not supported
     \param $group Which transformation group to use, of which the rules will be applied.
     \param $charset Which charset to use when transforming, if \c false it will use current charset (i18n.ini).
     \param $useCache If \c true then it will use cache files for the tables,
                      if not it will have to calculate them each time.
     \return The transformed text, or \c false if the commands of the group could not be found.
    */
    public function transformByGroup( $text, $group, $charset = false, $useCache = true )
    {
        if ( $text === '' )
        {
            return $text;
        }

        $charsetName = ( $charset === false ? eZTextCodec::internalCharset() : eZCharsetInfo::realCharsetCode( $charset ) );

        $filepath = false;
        if ( $useCache )
        {
            // CRC32 is used for speed, MD5 would be more unique but is slower
            $keyText = 'Group:' . $group;
            $key = eZSys::ezcrc32( $keyText . '-' . $charset );
            $filepath = $this->cacheFilePath( 'g-' . $group . '-',
                                              '-' . $charsetName,
                                              $key );

            // Try to execute the code in the cache file, if it succeeds
            // the returned value is the transformed text
            $retText = $this->executeCacheFile( $text, $filepath );
            if ( $retText !== false )
            {
                return $retText;
            }
        }

        $commands = $this->groupCommands( $group );
        if ( $commands === false )
        {
            return false;
        }

        $mapper = new eZCodeMapper();

        $mapper->loadTransformationFiles( $charsetName, $group );

        $rules = array();
        foreach ( $commands as $command )
        {
            $rules = array_merge( $rules,
                                  $mapper->decodeCommand( $command['command'], $command['parameters'] ) );
        }

        // First generate a unicode based mapping table from the rules
        $unicodeTable = $mapper->generateMappingCode( $rules );
        unset( $unicodeTable[0] );
        // Then transform that to a table that works with the current charset
        // Any character not available in the current charset will be removed
        $charsetTable = $mapper->generateCharsetMappingTable( $unicodeTable, $charset );
        $transformationData = array( 'table' => $charsetTable );
        unset( $unicodeTable );

        if ( $useCache )
        {
            // Commands may provide PHP code which is appended to the cache file
            $extraCode = '';
            foreach ( $commands as $command )
            {
                $code = $mapper->generateCommandCode( $command, $charsetName );
                if ( $code !== false )
                {
                    $extraCode .= $code . "\n";
                }
            }
            $this->storeCacheFile( $filepath, $transformationData,
                                   $extraCode,
                                   'Group:' . $group, $charsetName );
        }

        // Execute transformations
        $text = strtr( $text, $transformationData['table'] );

        // Execute custom code
        foreach ( $commands as $command )
        {
            $mapper->executeCommandCode( $text, $command, $charsetName );
        }

        return $text;
    }

    /*!
     \private
     \static
     \return the path of the cached transformation tables.

     The path is kept in \c $GLOBALS['eZCodeMapperCachePath'] once it has been determined.
    */
    public static function cachedTransformationPath()
    {
        if ( !isset( $GLOBALS['eZCodeMapperCachePath'] ) )
        {
            $sys = eZSys::instance();
            $GLOBALS['eZCodeMapperCachePath'] = $sys->cacheDirectory() . '/trans';
        }
        return $GLOBALS['eZCodeMapperCachePath'];
    }

    /*!
     \private
     Finds all commands defined for group \a $group.
     The groups and their commands are defined in \c transform.ini.
     The result is remembered per group in \c $GroupRules.

     \return An array with commands, or \c false if the group is not active or
             is missing in \c transform.ini. Each entry consists of:
             - command - Name of the command
             - parameters - Array with parameters for command
    */
    public function groupCommands( $group )
    {
        if ( isset( $this->GroupRules[$group] ) )
        {
            return $this->GroupRules[$group];
        }

        $ini = eZINI::instance( 'transform.ini' );
        $groups = $ini->variable( 'Transformation', 'Groups' );
        if ( !in_array( $group, $groups ) )
        {
            eZDebug::writeError( "Transformation group $group is not part of the active group list Groups in transform.ini", __METHOD__ );
            return false;
        }

        if ( !$ini->hasGroup( $group ) )
        {
            eZDebug::writeError( "Transformation group $group is missing in transform.ini", __METHOD__ );
            return false;
        }

        $rules = array();
        $ruleTexts = $ini->variable( $group, 'Commands' );
        foreach ( $ruleTexts as $ruleText )
        {
            // A command looks like "name" or "name(param1,param2)", other texts are skipped
            if ( preg_match( "#^([a-zA-Z][a-zA-Z0-9_-]+)(\((.+)\))?$#", $ruleText, $matches ) )
            {
                $parameters = array();
                if ( isset( $matches[2] ) )
                {
                    $parameters = explode( ',', $matches[3] );
                }
                $rules[] = array( 'command' => $matches[1],
                                  'parameters' => $parameters );
            }
        }

        $this->GroupRules[$group] = $rules;
        return $rules;
    }

    /*!
     Get cache file path. The cache directory is created if it does not exist.

     \param $prefix Text placed before the key in the file name
     \param $suffix Text placed after the key in the file name
     \param $key Numeric key, it is formatted as an unsigned integer

     \return cache file path.
    */
    public function cacheFilePath( $prefix, $suffix, $key )
    {
        $path = self::cachedTransformationPath();
        if ( !file_exists( $path ) )
        {
            eZDir::mkdir( $path, false, true );
        }
        return $path . '/' . $prefix . sprintf( "%u", $key ) . $suffix . '.ctt.php'; // ctt=charset transform table
    }

    /*!
     \private
     \param $text The text that should be transformed
     \param $filepath The filepath for the cache file
     \param $timestamp A timestamp value which is matched against the cache file,
                       pass for instance the timestamp of the INI file.

     \return The transformed text, or \c false if there is no usable cache file.
             A cache file is not used if it is older than the cache file of
             \c transform.ini, older than \c CODE_DATE or older than \a $timestamp.
    */
    protected function executeCacheFile( $text, $filepath, $timestamp = false )
    {
        if ( !file_exists( $filepath ) )
        {
            return false;
        }

        $time = filemtime( $filepath );
        $ini = eZINI::instance( 'transform.ini' );
        if ( $ini->CacheFile && file_exists( $ini->CacheFile ) && $time < filemtime( $ini->CacheFile ) )
        {
            return false;
        }

        // A timestamp of false counts as 0, which leaves CODE_DATE as the lower limit
        if ( $time >= max( self::CODE_DATE, (int)$timestamp ) )
        {
            // Execute the PHP file, the code in it transforms $text
            include $filepath;
            return $text;
        }

        return false;
    }

    /*!
     \private
     Stores the transformation data \a $transformationData in the cache file \a $filepath.
     The generated file applies the table to \c $text with \c strtr and then
     runs \a $extraCode if there is any.

     \param $filepath The filepath for the cache file
     \param $transformationData Array with the mapping table in the entry \c table
     \param $extraCode PHP code to append to the file, or an empty string
     \param $type Text describing the cache type, written as a comment
     \param $charsetName Name of the charset, written as a comment

     \return The return value of eZPHPCreator::store().
    */
    public function storeCacheFile( $filepath, $transformationData, $extraCode, $type, $charsetName )
    {
        $file = basename( $filepath );
        $dir = dirname( $filepath );
        $php = new eZPHPCreator( $dir, $file );

        $php->addComment( "Cached transformation data" );
        $php->addComment( "Type: $type" );
        $php->addComment( "Charset: $charsetName" );
        $php->addComment( "Cached transformation data" );

        $php->addCodePiece( '$data = ' . self::varExport( $transformationData ) . ";\n" );
        $php->addCodePiece( "\$text = strtr( \$text, \$data['table'] );\n" );

        if ( $extraCode )
        {
            $php->addCodePiece( $extraCode );
        }

        return $php->store( true );
    }

    /*!
     \private
     Creates a text representation of the value \a $value which can
     be placed in files and be read back by a PHP parser as it was.
     This is a thin wrapper around the PHP function \c var_export.

     \return The PHP code for \a $value as a string.
    */
    public static function varExport( $value )
    {
        return var_export( $value, true );
    }

    /*!
     \static
     Returns the current word separator, if none is set it will read from site.ini URLTranslator/WordSeparator.
     The values \c dash, \c underscore and \c space are recognized, any other value gives a dash.
     The separator found is remembered so the INI setting is only read once.
     \sa setWordSeparator
     */
    public static function wordSeparator()
    {
        if ( isset( $GLOBALS['eZCharTransform_wordSeparator'] ) )
        {
            return $GLOBALS['eZCharTransform_wordSeparator'];
        }

        $ini = eZINI::instance();
        switch ( strtolower( $ini->variable( "URLTranslator", "WordSeparator" ) ) )
        {
            case 'underscore':
                $separator = '_';
                break;
            case 'space':
                $separator = ' ';
                break;
            case 'dash':
            default:
                $separator = '-';
                break;
        }

        $GLOBALS['eZCharTransform_wordSeparator'] = $separator;
        return $separator;
    }

    /*!
     \static
     Sets the current word separator, set it to \c null to use default value.
     \sa wordSeparator
     */
    public static function setWordSeparator( $char )
    {
        $GLOBALS['eZCharTransform_wordSeparator'] = $char;
    }

    /*!
     \static
     Cleans up \a $text using the old style of url alias, the text is lowercased,
     each run of characters other than a-z and 0-9 becomes one underscore and
     underscores at the beginning and end are removed.
     \param $text The text to clean up
     \param $charsetName Not used
     \return The cleaned up text.
    */
    public static function commandUrlCleanupCompat( $text, $charsetName )
    {
        // Old style of url alias with lowercase only and underscores for separators
        $text = strtolower( $text );
        $text = preg_replace( array( "#[^a-z0-9]+#",
                                     "#^_+|_+$#" ),
                              array( "_",
                                     "" ),
                              $text );
        return $text;
    }

    /*!
     \static
     Cleans up \a $text for use in an url, characters other than a-z, A-Z, 0-9,
     underscore, exclamation mark, dot and dash are replaced with the word separator.
     \param $text The text to clean up
     \param $charsetName Not used
     \return The cleaned up text.
     \sa wordSeparator
    */
    public static function commandUrlCleanup( $text, $charsetName )
    {
        $sep  = self::wordSeparator();
        // The patterns use # as delimiter so it must be quoted as well
        $sepQ = preg_quote( $sep, '#' );
        $text = preg_replace( array( "#[^a-zA-Z0-9_!.-]+#",
                                     "#^[.]+|[!.]+$#", // Replace dots at beginning/end
                                     "#\.\.+#", // Replace double dots
                                     "#[{$sepQ}]+#", // Turn multiple separators into one
                                     "#^[{$sepQ}]+|[{$sepQ}]+$#" ), // Strip separator from beginning/end
                              array( $sep,
                                     $sep,
                                     $sep,
                                     $sep,
                                     "" ),
                              $text );
        return $text;
    }

    /*!
     \static
     Cleans up \a $text for use in an url with IRI support, only some reserved
     characters are replaced with the word separator.
     \param $text The text to clean up
     \param $charsetName Not used
     \return The cleaned up text.
     \sa wordSeparator
    */
    public static function commandUrlCleanupIRI( $text, $charsetName )
    {
        // With IRI support we keep all characters except some reserved ones,
        // they are space, backslash, percent, hash, ampersand, semi-colon, forward slash,
        //          colon, equal sign, question mark, square brackets, parenthesis, plus.
        //
        // Note: Space is turned into the separator to make it easier for people to
        //       paste urls from the system and have the whole url recognized
        //       instead of being broken off
        $sep  = self::wordSeparator();
        // The patterns use # as delimiter so it must be quoted as well
        $sepQ = preg_quote( $sep, '#' );
        // Characters which are stripped from the beginning and end of the text
        $prepost = " ." . $sepQ;
        if ( $sep != "-" )
        {
            $prepost .= "-";
        }
        $text = preg_replace( array( "#[ \\\\%\#&;/:=?\[\]()+]+#",
                                     "#^[.]+|[!.]+$#", // Replace dots at beginning/end
                                     "#\.\.+#", // Replace double dots
                                     "#[{$sepQ}]+#", // Turn multiple separators into one
                                     "#^[{$prepost}]+|[{$prepost}]+$#" ),
                              array( $sep,
                                     $sep,
                                     $sep,
                                     $sep,
                                     "" ),
                              $text );
        return $text;
    }

    /**
     * Returns a shared instance of the eZCharTransform class.
     *
     * The instance is kept in $GLOBALS['eZCharTransformInstance'].
     *
     * @return eZCharTransform
     */
    public static function instance()
    {
        if ( !isset( $GLOBALS['eZCharTransformInstance'] ) )
        {
            $GLOBALS['eZCharTransformInstance'] = new eZCharTransform();
        }
        return $GLOBALS['eZCharTransformInstance'];
    }
}
00454 
00455 ?>