eZ Publish  [trunk]
ezcodemapper.php
Go to the documentation of this file.
00001 <?php
00002 /**
00003  * File containing the eZCodeMapper class.
00004  *
00005  * @copyright Copyright (C) 1999-2012 eZ Systems AS. All rights reserved.
00006  * @license http://www.gnu.org/licenses/gpl-2.0.txt GNU General Public License v2
00007  * @version //autogentag//
00008  * @package lib
00009  */
00010 
00011 /*!
00012   \class eZCodeMapper ezcodemapper.php
00013   \ingroup eZI18N
00014   \brief Handles mapping of character codes
00015 
00016 */
00017 
00018 class eZCodeMapper
00019 {
00020     const TYPE_DIRECT = 1;
00021     const TYPE_RANGE = 2;
00022     const TYPE_REPLACE = 3;
00023 
00024     /*!
00025      Constructor
00026     */
00027     function eZCodeMapper()
00028     {
00029         $this->TransformationTables = array();
00030         $this->TransformationFiles = array();
00031     }
00032 
00033     /*!
00034      \return The mapping table for identifier \a $identifier or \c false if it is not found.
00035     */
00036     function mappingTable( $identifier )
00037     {
00038         if ( isset( $this->TransformationTables[$identifier] ) )
00039             return $this->TransformationTables[$identifier];
00040         return false;
00041     }
00042 
00043     /*!
00044      \return An array with the names of rules which are currently available.
00045     */
00046     function ruleNames()
00047     {
00048         return array_keys( $this->TransformationTables );
00049     }
00050 
00051     /*!
00052      Outputs error \a $text found in parsed file at position \a $position.
00053     */
00054     function error( $text, $position = false )
00055     {
00056         if ( $position )
00057         {
00058             $str = $position['file'] . ':' . $position['from'][0] . ' C' . $position['from'][1];
00059             if ( isset( $position['to'] ) )
00060                 $str .= ' -> L' . $position['to'][0] . ' C' . $position['to'][1];
00061             $str .= ':';
00062         }
00063         $str .= $text;
00064         if ( class_exists( 'ezcli' ) )
00065         {
00066             $cli = eZCLI::instance();
00067             $cli->error( $str );
00068         }
00069         else
00070         {
00071             eZDebug::writeError( $str, __METHOD__ );
00072         }
00073     }
00074 
00075     /*!
00076      Outputs warning \a $text found in parsed file at position \a $position.
00077     */
00078     function warning( $text, $position = false )
00079     {
00080         if ( $position )
00081         {
00082             $str = $position['file'] . ':' . $position['from'][0] . ' C' . $position['from'][1];
00083             if ( isset( $position['to'] ) )
00084                 $str .= ' -> L' . $position['to'][0] . ' C' . $position['to'][1];
00085             $str .= ':';
00086         }
00087         $str .= $text;
00088         if ( class_exists( 'ezcli' ) )
00089         {
00090             $cli = eZCLI::instance();
00091             $cli->warning( $str );
00092         }
00093         else
00094         {
00095             eZDebug::writeWarning( $str, __METHOD__ );
00096         }
00097     }
00098 
00099     /*!
00100      \return \c true if the transformation file is already loaded.
00101     */
00102     function isTranformationLoaded( $name )
00103     {
00104         return in_array( $name, $this->TransformationFiles );
00105     }
00106 
00107     /*!
00108      Loads all transformation files defined in \c transform.ini to the current
00109      mapper. It will also load any transformations found in extensions.
00110 
00111      \param $currentCharset The name of the current charset in use. The caller must
00112                             make sure this is not an alias by using eZCharsetInfo::realCharsetCode()
00113      \param $transformationGroup The transformation group which is currently used or \c false for none.
00114     */
00115     function loadTransformationFiles( $currentCharset, $transformationGroup )
00116     {
00117         $ini = eZINI::instance( 'transform.ini' );
00118         $repositoryList = array( $ini->variable( 'Transformation', 'Repository' ) );
00119         $files = $ini->variable( 'Transformation', 'Files' );
00120         $extensions = $ini->variable( 'Transformation', 'Extensions' );
00121         $repositoryList = array_merge( $repositoryList,
00122                                        eZExtension::expandedPathList( $extensions, 'transformations' ) );
00123 
00124         // Check if the current charset maps to a unicode group
00125         // If it does it can trigger loading of additional files
00126         $unicodeGroups = array();
00127         $charsets = $ini->variable( 'Transformation', 'Charsets' );
00128         foreach ( $charsets as $entry )
00129         {
00130             list ( $charset, $group ) = explode( ';', $entry, 2 );
00131             $charset = eZCharsetInfo::realCharsetCode( $charset );
00132             if ( $charset == $currentCharset )
00133             {
00134                 if ( !in_array( $group, $unicodeGroups ) )
00135                     $unicodeGroups[] = $group;
00136             }
00137         }
00138 
00139         // If we are using transformation groups then add that as
00140         // a unicode group. This causes it load transformation files
00141         // specific to that group.
00142         if ( $transformationGroup !== false )
00143             $unicodeGroups[] = $transformationGroup;
00144 
00145         // Add any extra files from the unicode groups
00146         foreach ( $unicodeGroups as $unicodeGroup )
00147         {
00148             if ( $ini->hasGroup( $unicodeGroup ) )
00149             {
00150                 $files = array_merge( $files, $ini->variable( $unicodeGroup, 'Files' ) );
00151                 $extensions = $ini->variable( $unicodeGroup, 'Extensions' );
00152                 $repositoryList = array_merge( $repositoryList,
00153                                                eZExtension::expandedPathList( $extensions, 'transformations' ) );
00154             }
00155         }
00156 
00157         foreach ( $files as $file )
00158         {
00159             // Only load files that are not currently loaded
00160             if ( $this->isTranformationLoaded( $file ) )
00161                 continue;
00162 
00163             foreach ( $repositoryList as $repository )
00164             {
00165                 $trFile = $repository . '/' . $file;
00166                 if ( file_exists( $trFile ) )
00167                 {
00168                     $this->parseTransformationFile( $trFile, $file );
00169                 }
00170             }
00171         }
00172     }
00173 
00174     /*!
00175      Parses the transformation file \a $filename and appends any rules it finds
00176      to the current rule list.
00177      \param $name The name of transformation file as it was requested, ie. without a path
00178     */
00179     function parseTransformationFile( $filename, $name )
00180     {
00181 //         eZDebug::writeDebug( "Parsing file $filename" );
00182         $tbl = array();
00183 
00184         $fd = fopen( $filename, "rb" );
00185         if ( !$fd )
00186         {
00187             $this->error( "Failed opening $filename" );
00188             return false;
00189         }
00190 
00191         $this->TransformationFiles[] = $name;
00192 
00193         $this->ISOUnicodeCodec = eZTextCodec::instance( 'iso-8859-1', 'unicode' );
00194 
00195         $buffer = '';
00196         $lineNum = 1;
00197         $i = 0;
00198         $hexValues = "0123456789abcdefABCDEF";
00199         $identifier = false;
00200 
00201         // The big funky parser starts here
00202         // It starts by reading a chunk of data from the file
00203         // then splits everything into an array with lines.
00204         // Then it traverses one line at a time looking for
00205         // identifiers and rules. Comments will be removed before the
00206         // line is parsed for identifiers and rules.
00207 
00208         while ( !feof( $fd ) or strlen( $buffer ) > 0 )
00209         {
00210             $lines = array();
00211             $len = strlen( $buffer );
00212             // Check if we have data in the buffer yet
00213             // Note: The actual buffer reading is done at the end of this while loop
00214             if ( $len > 0 )
00215             {
00216                 $endPos = false;
00217                 $eolPos = 0;
00218                 // Look for complete lines and append to $lines
00219                 while ( $eolPos !== false and $eolPos < $len )
00220                 {
00221                     $eolPos = strpos( $buffer, "\n", $endPos );
00222                     if ( $eolPos !== false )
00223                     {
00224                         $line = substr( $buffer, $endPos, $eolPos - $endPos );
00225                         $lines[] = array( 'text' => $line,
00226                                           'line' => $lineNum );
00227                         ++$lineNum;
00228                         $endPos = $eolPos + 1;
00229                     }
00230                 }
00231 
00232                 // If we have leftover data place that back in $buffer
00233                 if ( $endPos !== false )
00234                 {
00235                     $buffer = substr( $buffer, $endPos );
00236                 }
00237             }
00238 
00239             // Once we have some lines start parsing them one at a time
00240             foreach ( $lines as $lineData )
00241             {
00242                 $line = $lineData['text'];
00243                 $lineOrg = $line;
00244                 $linePos = $lineData['line'];
00245                 $commentPos = strpos( $line, '#' );
00246                 $origLine = $line;
00247                 // Get rid of any comments before we check the line
00248                 if ( $commentPos !== false )
00249                 {
00250                     $line = substr( $line, 0, $commentPos );
00251                 }
00252                 $trimLine = trim( $line );
00253                 // Skip empty lines
00254                 if ( strlen( $trimLine ) == 0 )
00255                     continue;
00256 
00257 //                 print( "Line: '$line'\n" );
00258 
00259                 $unicodeData = false;
00260 
00261                 $sourceValue = false;
00262                 $sourceEndValue = false;
00263                 $destinationValues = false;
00264                 $transposeValue = false;
00265                 $transposeAdd = true;
00266                 $moduloValue = 1;
00267                 // source, marker, range_input, range_marker, map_input, transpose_input, replace_input
00268                 $state = 'source';
00269                 // map, transpose, replace
00270                 $type = false;
00271 
00272                 $len = strlen( $line );
00273                 if ( preg_match( '#^(.+):[ \t]*$#', $line, $matches ) )
00274                 {
00275                     $identifier = $matches[1];
00276                     if ( !preg_match( '#^[a-zA-Z_-][a-zA-Z0-9_-]*$#', $identifier ) )
00277                     {
00278                         $this->warning( "Invalid identifier '$identifier', can only contain a-z, a-Z - and _",
00279                                       array( 'file' => $filename, 'from' => array( $linePos, strlen( $identifier ) ) ) );
00280                         $identifier = false;
00281                         continue;
00282                     }
00283 //                     print( "identifier '$identifier'\n" );
00284                     continue;
00285                 }
00286                 else if ( $identifier === false )
00287                 {
00288                     $this->warning( "No identifier defined yet, skipping: '" . $line . "'",
00289                                     array( 'file' => $filename, 'from' => array( $linePos, 0 ) ) );
00290                     continue;
00291                 }
00292                 else
00293                 {
00294                     $pos = 0;
00295                     $col = 0;
00296                     $failed = false;
00297                     while ( $pos < $len )
00298                     {
00299                         while ( $pos < $len and
00300                                 ( $line[$pos] == ' ' or
00301                                   $line[$pos] == "\t" ) )
00302                         {
00303                             ++$pos;
00304                         }
00305                         if ( $pos >= $len )
00306                             break;
00307 
00308                         $char = $line[$pos];
00309                         $unicodeData = false;
00310                         if ( $char == '"' )
00311                         {
00312                             $delimiterPos = $pos;
00313                             while ( $delimiterPos < $len )
00314                             {
00315                                 $delimiterPos = strpos( $line, '"', $delimiterPos + 1 );
00316                                 if ( $delimiterPos === false or
00317                                      $delimiterPos <= $pos + 1 or
00318                                      $line[$delimiterPos - 1] != "\\" )
00319                                     break;
00320                             }
00321                             if ( $delimiterPos === false )
00322                             {
00323                                 $this->warning( "No end-quote found for line, skipping: '$line'",
00324                                                 array( 'file' => $filename,
00325                                                        'from' => array( $linePos, $pos ),
00326                                                        'to' => array( $linePos, strlen( $line ) ) ) );
00327                                 $pos = $len;
00328                                 $failed = true;
00329                                 break;
00330                             }
00331                             $str = str_replace( array( "\\\"", "\\\\" ),
00332                                                 array( "\"", "\\" ),
00333                                                 substr( $line, $pos + 1, $delimiterPos - $pos - 1 ) );
00334 //                             print( "string '$str'\n" );
00335                             $pos = $delimiterPos + 1;
00336                             $unicodeData = array( 'value' => $str,
00337                                                   'type' => 'string' );
00338                         }
00339                         else if ( $char == 'U' and
00340                              $pos + 1 < $len and
00341                              $line[$pos + 1] == '+' )
00342                         {
00343                             $hexPos = $pos + 2;
00344                             if ( $hexPos + 4 > $len )
00345                             {
00346                                 $col = $hexPos;
00347                                 $this->warning( "Found U+ value with " . ( 4 - ( $len - $hexPos ) ) . " missing hex numbers",
00348                                                 array( 'file' => $filename,
00349                                                        'from' => array( $linePos, $hexPos ) ) );
00350                                 $failed = true;
00351                                 $pos = $hexPos;
00352                                 break;
00353                             }
00354                             $hasHexValues = true;
00355                             for ( $offset = 0; $offset < 4; ++$offset )
00356                             {
00357                                 $hexChar = $line[$hexPos + $offset];
00358                                 if ( $hexChar == ' ' or
00359                                      $hexChar == "\t" )
00360                                 {
00361                                     $col = $hexPos + $offset;
00362                                     $hasHexValues = false;
00363                                     $this->warning( "Found U+ value with " . ( 4 - $offset ) . " missing hex numbers",
00364                                                     array( 'file' => $filename,
00365                                                            'from' => array( $linePos, $hexPos ),
00366                                                            'to' => array( $linePos, $hexPos + $offset ) ) );
00367                                     $failed = true;
00368                                     $pos = $hexPos + $offset;
00369                                     break;
00370                                 }
00371                                 if ( strpos( $hexValues, $hexChar ) === false )
00372                                 {
00373                                     $col = $hexPos + $offset;
00374                                     $hasHexValues = false;
00375                                     $this->warning( "Found U+ value with invalid hex numbers ($hexChar)",
00376                                                     array( 'file' => $filename,
00377                                                            'from' => array( $linePos, $hexPos ),
00378                                                            'to' => array( $linePos, $hexPos + $offset ) ) );
00379                                     $pos = $hexPos + $offset;
00380                                     $failed = true;
00381                                     break;
00382                                 }
00383                             }
00384                             if ( $failed )
00385                                 break;
00386                             if ( $hasHexValues )
00387                             {
00388                                 $unicodeValue = hexdec( substr( $line, $hexPos, 4 ) );
00389                                 $unicodeData = array( 'value' => $unicodeValue,
00390                                                       'type' => 'unicode' );
00391 //                                 print( "unicode U+ '$unicodeValue'\n" );
00392                             }
00393                             $pos = $hexPos + 4;
00394                         }
00395                         else if ( strpos( $hexValues, $char ) !== false and
00396                                   $pos + 1 < $len and
00397                                   strpos( $hexValues, $line[$pos + 1] ) !== false )
00398                         {
00399                             $hexPos = $pos;
00400                             if ( $hexPos + 2 > $len )
00401                             {
00402                                 $col = $len;
00403                                 $this->warning( "Found ASCII value with " . ( 2 - ( $len - $hexPos ) ) . " missing hex numbers",
00404                                                 array( 'file' => $filename,
00405                                                        'from' => array( $linePos, $hexPos ) ) );
00406                                 $pos = $hexPos;
00407                                 $failed = true;
00408                                 break;
00409                             }
00410                             $hasHexValues = true;
00411                             for ( $offset = 0; $offset < 2; ++$offset )
00412                             {
00413                                 $hexChar = $line[$hexPos + $offset];
00414                                 if ( $hexChar == ' ' or
00415                                      $hexChar == "\t" )
00416                                 {
00417                                     $col = $hexPos + $offset;
00418                                     $hasHexValues = false;
00419                                     $this->warning( "Found ASCII value with " . ( 2 - $offset ) . " missing hex numbers",
00420                                                     array( 'file' => $filename,
00421                                                            'from' => array( $linePos, $hexPos ),
00422                                                            'to' => array( $linePos, $hexPos + $offset ) ) );
00423                                     $pos = $hexPos + $offset;
00424                                     $failed = true;
00425                                     break;
00426                                 }
00427                                 if ( strpos( $hexValues, $hexChar ) === false )
00428                                 {
00429                                     $col = $hexPos + $offset;
00430                                     $hasHexValues = false;
00431                                     $this->warning( "Found ASCII value with invalid hex numbers ($hexChar)",
00432                                                     array( 'file' => $filename,
00433                                                            'from' => array( $linePos, $hexPos ),
00434                                                            'to' => array( $linePos, $hexPos + $offset ) ) );
00435                                     $pos = $hexPos + $offset;
00436                                     $failed = true;
00437                                     break;
00438                                 }
00439                             }
00440                             if ( $failed )
00441                                 break;
00442                             if ( $hasHexValues )
00443                             {
00444                                 $asciiValue = hexdec( substr( $line, $hexPos, 4 ) );
00445 //                                 print( "unicode ASCII '$asciiValue'\n" );
00446                                 $unicodeData = array( 'value' => $asciiValue,
00447                                                       'type' => 'ascii' );
00448                             }
00449                             $pos = $hexPos + 2;
00450                         }
00451                         else if ( substr( $line, $pos, 6 ) == 'remove' )
00452                         {
00453 //                             print( "remove character\n" );
00454                             $unicodeData = array( 'value' => false,
00455                                                   'type' => 'remove' );
00456                             $pos += 6;
00457                         }
00458                         else if ( substr( $line, $pos, 4 ) == 'keep' )
00459                         {
00460 //                             print( "keep character\n" );
00461                             $unicodeData = array( 'value' => true,
00462                                                   'type' => 'keep' );
00463                             $pos += 4;
00464                         }
00465 
00466                         if ( $unicodeData )
00467                         {
00468 //                             print( "data state: $state\n" );
00469                             // source, marker, range_input, range_marker, map_input, transpose_input, replace_input, transpose_modulo
00470                             if ( $state == 'source' )
00471                             {
00472                                 if ( $unicodeData['type'] == 'string' and
00473                                      strlen( $unicodeData['value'] ) > 1 )
00474                                 {
00475                                     $this->warning( "Text string with more than one character cannot be used as input value '" . $unicodeData['value'] . "'",
00476                                                     array( 'file' => $filename,
00477                                                            'from' => array( $linePos, $pos ) ) );
00478                                     $failed = true;
00479                                     break;
00480                                 }
00481                                 $sourceValue = $this->extractUnicodeValue( $unicodeData );
00482                                 $state = 'marker';
00483                             }
00484                             else if ( $state == 'marker' )
00485                             {
00486                                 $this->warning( "Source value not expected, a source value has already been extracted at $line" . "[$pos]",
00487                                                 array( 'file' => $filename,
00488                                                        'from' => array( $linePos, $pos ) ) );
00489                                 $failed = true;
00490                                 break;
00491                             }
00492                             else if ( $state == 'range_input' )
00493                             {
00494                                 if ( $unicodeData['type'] == 'string' and
00495                                      strlen( $unicodeData['value'] ) > 1 )
00496                                 {
00497                                     $this->warning( "Text string with more than one character cannot be used as range end value '" . $unicodeData['value'] . "'",
00498                                                     array( 'file' => $filename,
00499                                                            'from' => array( $linePos, $pos ) ) );
00500                                     $failed = true;
00501                                     break;
00502                                 }
00503                                 $sourceEndValue = $this->extractUnicodeValue( $unicodeData );
00504                                 $state = 'range_marker_or_modulo';
00505                             }
00506                             else if ( $state == 'range_marker_or_modulo' or
00507                                       $state == 'range_marker' )
00508                             {
00509                                 $this->warning( "Range value not expected, a range value has already been extracted at $line" . "[$pos]",
00510                                                 array( 'file' => $filename,
00511                                                        'from' => array( $linePos, $pos ) ) );
00512                                 $failed = true;
00513                                 break;
00514                             }
00515                             else if ( $state == 'map_input' )
00516                             {
00517                                 if ( !is_array( $destinationValues ) )
00518                                     $destinationValues = array();
00519                                 $destinationValues = array_merge( $destinationValues,
00520                                                                   $this->extractUnicodeValues( $unicodeData ) );
00521                                 $type = 'map';
00522                             }
00523                             else if ( $state == 'replace_input' )
00524                             {
00525                                 if ( !is_array( $destinationValues ) )
00526                                     $destinationValues = array();
00527                                 $destinationValues = array_merge( $destinationValues,
00528                                                                   $this->extractUnicodeValues( $unicodeData ) );
00529                                 $type = 'replace';
00530                             }
00531                             else if ( $state == 'transpose_input' )
00532                             {
00533                                 if ( $unicodeData['type'] == 'string' and
00534                                      strlen( $unicodeData['value'] ) > 1 )
00535                                 {
00536                                     $this->warning( "Text string with more than one character cannot be used as transpose value '" . $unicodeData['value'] . "'",
00537                                                     array( 'file' => $filename,
00538                                                            'from' => array( $linePos, $pos ) ) );
00539                                     $failed = true;
00540                                     break;
00541                                 }
00542                                 $transposeValue = $this->extractUnicodeValue( $unicodeData );
00543                                 $type = 'transpose';
00544                             }
00545                             else if ( $state == 'transpose_modulo' )
00546                             {
00547                                 if ( $unicodeData['type'] == 'string' and
00548                                      strlen( $unicodeData['value'] ) > 1 )
00549                                 {
00550                                     $this->warning( "Text string with more than one character cannot be used as transpose modulo value '" . $unicodeData['value'] . "'",
00551                                                     array( 'file' => $filename,
00552                                                            'from' => array( $linePos, $pos ) ) );
00553                                     $failed = true;
00554                                     break;
00555                                 }
00556                                 $moduloValue = $this->extractUnicodeValue( $unicodeData );
00557                                 if ( $moduloValue == 0 )
00558                                 {
00559                                     $this->error( "Modulo value of 0 is not allowed, 1 will be used instead",
00560                                                   array( 'file' => $filename,
00561                                                          'from' => array( $linePos, $pos ) ) );
00562                                     // Note: There is another 0 check in generateSimpleMappingTable()
00563                                 }
00564 //                                 print( "modulo value=$moduloValue\n" );
00565                                 $state = 'range_marker';
00566                             }
00567                         }
00568                         else if ( !$failed )
00569                         {
00570 //                             print( "command state: $state\n" );
00571                             // source, marker, range_input, range_marker, map_input, transpose_input, replace_input
00572                             if ( $state == 'source' )
00573                             {
00574                                 if ( $char == '=' )
00575                                 {
00576                                     $this->warning( "Cannot use map marker $char without prior character value",
00577                                                     array( 'file' => $filename,
00578                                                            'from' => array( $linePos, $pos ) ) );
00579                                     $failed = true;
00580                                     break;
00581                                 }
00582                                 else if ( $char == '+' or
00583                                           $char == '-' )
00584                                 {
00585                                     $this->warning( "Cannot use range marker $char without prior character value",
00586                                                     array( 'file' => $filename,
00587                                                            'from' => array( $linePos, $pos ) ) );
00588                                     $failed = true;
00589                                     break;
00590                                 }
00591                                 else
00592                                 {
00593                                     $this->warning( "Unknown character '$char', expecting input value",
00594                                                     array( 'file' => $filename,
00595                                                            'from' => array( $linePos, $pos ) ) );
00596                                     $failed = true;
00597                                     break;
00598                                 }
00599                             }
00600                             else if ( $state == 'marker' )
00601                             {
00602                                 if ( $char == '=' )
00603                                 {
00604                                     $state = 'map_input';
00605                                     ++$pos;
00606                                 }
00607                                 else if ( $char == '-' )
00608                                 {
00609                                     $state = 'range_input';
00610                                     ++$pos;
00611                                 }
00612                                 else if ( $char == '+' )
00613                                 {
00614                                     $this->warning( "Cannot use range marker $char without prior character value",
00615                                                     array( 'file' => $filename,
00616                                                            'from' => array( $linePos, $pos ) ) );
00617                                     $failed = true;
00618                                     break;
00619                                 }
00620                                 else
00621                                 {
00622                                     $this->warning( "Unknown character '$char', expecting marker",
00623                                                     array( 'file' => $filename,
00624                                                            'from' => array( $linePos, $pos ) ) );
00625                                     $failed = true;
00626                                     break;
00627                                 }
00628                             }
00629                             else if ( $state == 'range_marker_or_modulo' or
00630                                       $state == 'range_marker' )
00631                             {
00632                                 if ( $state == 'range_marker_or_modulo' and
00633                                      $char == '%' )
00634                                 {
00635 //                                     print( "found modulo marker\n" );
00636                                     // Look for modulo value
00637                                     $state = 'transpose_modulo';
00638                                     ++$pos;
00639                                 }
00640                                 else if ( $char == '=' )
00641                                 {
00642                                     $state = 'replace_input';
00643                                     ++$pos;
00644                                 }
00645                                 else if ( $char == '-' or
00646                                           $char == '+' )
00647                                 {
00648                                     $transposeAdd = ( $char == '+' ? true : false );
00649                                     $state = 'transpose_input';
00650                                     ++$pos;
00651                                 }
00652                                 else
00653                                 {
00654                                     $this->warning( "Unknown character '$char', expecting range end value",
00655                                                     array( 'file' => $filename,
00656                                                            'from' => array( $linePos, $pos ) ) );
00657                                     $failed = true;
00658                                     break;
00659                                 }
00660                             }
00661                             else if ( $state == 'map_input' )
00662                             {
00663                                 if ( $char == '=' )
00664                                 {
00665                                     $this->warning( "Duplicate mapping marker $char",
00666                                                     array( 'file' => $filename,
00667                                                            'from' => array( $linePos, $pos ) ) );
00668                                     $failed = true;
00669                                     break;
00670                                 }
00671                                 else if ( $char == '-' or
00672                                           $char == '+' )
00673                                 {
00674                                     $this->warning( "Already mapping values, cannot use range/transpose marker $char",
00675                                                     array( 'file' => $filename,
00676                                                            'from' => array( $linePos, $pos ) ) );
00677                                     $failed = true;
00678                                     break;
00679                                 }
00680                                 else
00681                                 {
00682                                     $this->warning( "Unknown character '$char', expecting output values",
00683                                                     array( 'file' => $filename,
00684                                                            'from' => array( $linePos, $pos ) ) );
00685                                     $failed = true;
00686                                     break;
00687                                 }
00688                             }
00689                             else if ( $state == 'transpose_modulo' )
00690                             {
00691                                 if ( $char == '%' )
00692                                 {
00693                                     $this->warning( "Modulo marker already used, cannot use $char",
00694                                                     array( 'file' => $filename,
00695                                                            'from' => array( $linePos, $pos ) ) );
00696                                     $failed = true;
00697                                     break;
00698                                 }
00699                                 else if ( $char == '-' or
00700                                           $char == '+' )
00701                                 {
00702                                     $this->warning( "Transpose marker $char used, but no modulo value has been found yet",
00703                                                     array( 'file' => $filename,
00704                                                            'from' => array( $linePos, $pos ) ) );
00705                                     $failed = true;
00706                                     break;
00707                                 }
00708                                 else
00709                                 {
00710                                     $this->warning( "Unknown character '$char', expecting modulo value",
00711                                                     array( 'file' => $filename,
00712                                                            'from' => array( $linePos, $pos ) ) );
00713                                     $failed = true;
00714                                     break;
00715                                 }
00716                             }
00717                             else if ( $state == 'transpose_input' )
00718                             {
00719                                 if ( $char == '=' )
00720                                 {
00721                                     $this->warning( "Already transposing, cannot use mapping marker $char",
00722                                                     array( 'file' => $filename,
00723                                                            'from' => array( $linePos, $pos ) ) );
00724                                     $failed = true;
00725                                     break;
00726                                 }
00727                                 else if ( $char == '-' or
00728                                           $char == '+' )
00729                                 {
00730                                     $this->warning( "Duplicate transpose marker $char",
00731                                                     array( 'file' => $filename,
00732                                                            'from' => array( $linePos, $pos ) ) );
00733                                     $failed = true;
00734                                     break;
00735                                 }
00736                                 else
00737                                 {
00738                                     $this->warning( "Unknown character '$char', expecting transpose value",
00739                                                     array( 'file' => $filename,
00740                                                            'from' => array( $linePos, $pos ) ) );
00741                                     $failed = true;
00742                                     break;
00743                                 }
00744                             }
00745                             else if ( $state == 'replace_input' )
00746                             {
00747                                 if ( $char == '=' )
00748                                 {
00749                                     $this->warning( "Already replacing, cannot use mapping marker $char",
00750                                                     array( 'file' => $filename,
00751                                                            'from' => array( $linePos, $pos ) ) );
00752                                     $failed = true;
00753                                     break;
00754                                 }
00755                                 else if ( $char == '-' or
00756                                           $char == '+' )
00757                                 {
00758                                     $this->warning( "Already replacing, cannot use transpose marker $char",
00759                                                     array( 'file' => $filename,
00760                                                            'from' => array( $linePos, $pos ) ) );
00761                                     $failed = true;
00762                                     break;
00763                                 }
00764                                 else
00765                                 {
00766                                     $this->warning( "Unknown character '$char', expecting replace value",
00767                                                     array( 'file' => $filename,
00768                                                            'from' => array( $linePos, $pos ) ) );
00769                                     $failed = true;
00770                                     break;
00771                                 }
00772                             }
00773                         }
00774                     }
00775                     if ( !$failed )
00776                     {
00777                         if ( $identifier )
00778                         {
00779 //                             print( "\nGot type '$type'\n" );
00780 //                            if ( is_array( $destinationValues ) )
00781 //                                $destinationValues = array_diff( $destinationValues, array( '' ) );
00782 
00783                             if ( !isset( $tbl[$identifier] ) )
00784                                 $tbl[$identifier] = array();
00785 
00786                             if ( $type == 'map' )
00787                             {
00788 //                                 print( "***mapping***:\n" . $sourceValue . ' => ' . implode( ', ', $destinationValues ) . "\n\n" );
00789                                 $this->appendDirectMapping( $tbl[$identifier], $identifier, $sourceValue, $destinationValues );
00790                             }
00791                             else if ( $type == 'replace' )
00792                             {
00793 //                                 print( "***replacing***:\n" . $sourceValue . ' - ' . $sourceEndValue . ' => ' . implode( ', ', $destinationValues ) . "\n\n" );
00794                                 $this->appendReplaceMapping( $tbl[$identifier], $identifier, $sourceValue, $sourceEndValue, $destinationValues );
00795                             }
00796                             else if ( $type == 'transpose' )
00797                             {
00798 //                                 print( "***transposing***:\n" . $sourceValue . ' - ' . $sourceEndValue . ' % ' . $moduloValue . ' + ' . $transposeValue . "\n\n" );
00799                                 $this->appendTransposeMapping( $tbl[$identifier], $identifier, $sourceValue, $sourceEndValue, $transposeValue, $transposeAdd, $moduloValue );
00800                             }
00801                         }
00802 //                         else
00803 //                         {
00804 //                             print( "No identifier found yet, skipping entry!!!!!!!!!!\n" );
00805 //                         }
00806                     }
00807                     else
00808                     {
00809 //                         $this->warning( "Failed adding mapper",
00810 //                                         array( 'file' => $filename,
00811 //                                                'from' => array( $linePos, $pos ) ) );
00812                     }
00813                 }
00814             }
00815 
00816             // Here we read more data from the file, appending to
00817             // the $buffer variable
00818             if ( !feof( $fd ) )
00819             {
00820                 $buffer .= fread( $fd, 4096 );
00821 
00822                 // Make sure we have Unix endline characters
00823                 $buffer = preg_replace( "#(\r\n|\r|\n)#", "\n", $buffer );
00824             }
00825             ++$i;
00826         }
00827 
00828         fclose( $fd );
00829 
00830         $this->TransformationTables = array_merge( $this->TransformationTables, $tbl );
00831     }
00832 
00833     /*!
00834      \private
00835      Appends a mapping from one value to another.
00836      \param $block Current block it is working on
00837      \param $identifier The current identifier it is working on
00838      \param $sourceValue The original value
00839      \param $destinationValues The value it should be mapped to
00840     */
00841     function appendDirectMapping( &$block, $identifier, $sourceValue, $destinationValues )
00842     {
00843         $count = count( $block );
00844         if ( count( $destinationValues ) == 1 )
00845             $destinationValues = array_pop( $destinationValues );
00846         if ( isset( $block[$count - 1] ) and
00847              $block[$count - 1][0] == self::TYPE_DIRECT and
00848              $block[$count - 1][2] == $identifier )
00849         {
00850             $block[$count - 1][1][$sourceValue] = $destinationValues;
00851         }
00852         else
00853         {
00854             $block[] = array( self::TYPE_DIRECT,
00855                               array( $sourceValue => $destinationValues ),
00856                               $identifier );
00857 
00858         }
00859     }
00860 
00861     /*!
00862      \private
00863      Appends a mapping for a range of values into a specific value
00864      \param $block Current block it is working on
00865      \param $identifier The current identifier it is working on
00866      \param $sourceValue The start of the original value
00867      \param $sourceEndValue The ned of the original value
00868      \param $destinationValues The value it should be mapped to
00869     */
00870     function appendReplaceMapping( &$block, $identifier, $sourceValue, $sourceEndValue, $destinationValues )
00871     {
00872         $count = count( $block );
00873         if ( count( $destinationValues ) == 1 )
00874             $destinationValues = array_pop( $destinationValues );
00875         if ( isset( $block[$count - 1] ) and
00876              $block[$count - 1][0] == self::TYPE_REPLACE and
00877              $block[$count - 1][2] == $identifier )
00878         {
00879             $block[$count - 1][1][] = array( $sourceValue, $sourceEndValue, $destinationValues );
00880         }
00881         else
00882         {
00883             $block[] = array( self::TYPE_REPLACE,
00884                               array( array( $sourceValue, $sourceEndValue, $destinationValues ) ),
00885                               $identifier );
00886 
00887         }
00888     }
00889 
00890     /*!
00891      \private
00892      Appends a mapping for characters by transposing them up or down.
00893      \param $block Current block it is working on
00894      \param $identifier The current identifier it is working on
00895      \param $sourceValue The start of the original value
00896      \param $sourceEndValue The ned of the original value
00897      \param $transposeValue How much to transpose the values
00898      \param $addValue If \c true the $transposeValue is added to the range if not it is subtracted.
00899     */
00900     function appendTransposeMapping( &$block, $identifier, $sourceValue, $sourceEndValue, $transposeValue, $addValue, $moduloValue )
00901     {
00902         $count = count( $block );
00903         if ( isset( $block[$count - 1] ) and
00904              $block[$count - 1][0] == self::TYPE_RANGE and
00905              $block[$count - 1][2] == $identifier )
00906         {
00907             $block[$count - 1][1][] = array( $sourceValue, $sourceEndValue, $addValue ? $transposeValue : -$transposeValue, $moduloValue );
00908         }
00909         else
00910         {
00911             $block[] = array( self::TYPE_RANGE,
00912                               array( array( $sourceValue, $sourceEndValue, $addValue ? $transposeValue : -$transposeValue, $moduloValue ) ),
00913                               $identifier );
00914 
00915         }
00916     }
00917 
00918     /*!
00919      \private
00920      \return The first unicod value for the data entry \a $data.
00921     */
00922     function extractUnicodeValue( $data )
00923     {
00924         $type = $data['type'];
00925         if ( $type == 'string' )
00926         {
00927             $list = $this->ISOUnicodeCodec->convertString( $data['value'][0] );
00928             return $list[0];
00929         }
00930         else if ( $type == 'ascii' )
00931         {
00932             return $data['value'];
00933         }
00934         else if ( $type == 'unicode' )
00935         {
00936             return $data['value'];
00937         }
00938         else if ( $type == 'remove' )
00939         {
00940             return false;
00941         }
00942         else if ( $type == 'keep' )
00943         {
00944             return true;
00945         }
00946         return null;
00947     }
00948 
00949     /*!
00950      \private
00951      \return The unicode values for the data entry \a $data.
00952     */
00953     function extractUnicodeValues( $data )
00954     {
00955         $type = $data['type'];
00956         if ( $type == 'string' )
00957         {
00958             return $this->ISOUnicodeCodec->convertString( $data['value'] );
00959         }
00960         else if ( $type == 'ascii' )
00961         {
00962             return array( $data['value'] );
00963         }
00964         else if ( $type == 'unicode' )
00965         {
00966             return array( $data['value'] );
00967         }
00968         else if ( $type == 'remove' )
00969         {
00970             return array( false );
00971         }
00972         else if ( $type == 'keep' )
00973         {
00974             return array( true );
00975         }
00976         return array();
00977     }
00978 
00979     /*!
00980      \private
00981      Goes trough all entries in \a $table and if it finds identifier references
00982      it will fetch the table for that identifier and merge in the current one.
00983      \return The expanded table.
00984     */
00985     function expandInheritance( $table )
00986     {
00987         $newTable = array();
00988         foreach ( $table as $tableItem )
00989         {
00990             if ( is_string( $tableItem ) )
00991             {
00992                 $identifier = $tableItem;
00993                 $subTable = $this->mappingTable( $identifier );
00994                 if ( !$subTable )
00995                 {
00996                     eZDebug::writeError( "Failed to fetch mapping table for identifier: '$identifier'" );
00997                 }
00998                 else
00999                 {
01000                     $subTable = $this->expandInheritance( $subTable );
01001                     $newTable = array_merge( $newTable, $subTable );
01002                 }
01003             }
01004             else
01005             {
01006                 $newTable[] = $tableItem;
01007             }
01008         }
01009         return $newTable;
01010     }
01011 
01012     /*!
01013      Turns the character list $list into an array with ordinal values
01014      \param $list Can be on of these types:
01015                   - String - each character is turned into an ordinal value
01016                   - Numeric - the numeric is used as ordinal value
01017                   - Boolean - means no character
01018                   - Array - each element is turned into an ordinal value by recursion
01019     */
01020     function ordinalValues( $table, $list )
01021     {
01022         $ordinals = array();
01023         if ( is_string( $list ) )
01024         {
01025             $len = strlen( $list );
01026             for ( $offset = 0; $offset < $len; ++$offset )
01027             {
01028                 $ordinals[] = ord( $list[$offset] );
01029             }
01030         }
01031         else if ( is_numeric( $list ) )
01032         {
01033             $ordinals[] = $list;
01034         }
01035         else if ( is_array( $list ) )
01036         {
01037             foreach ( $list as $item )
01038             {
01039                 $ordinals = array_merge( $ordinals, eZCodeMapper::ordinalValues( $table, $item ) );
01040             }
01041         }
01042         $ordinals = eZCodeMapper::mapOrdinals( $table, $ordinals );
01043         return $ordinals;
01044     }
01045 
01046     /*!
01047      Goes trough each ordinal in \a $ordinals and sees if there is mapping for it.
01048      If it is the mapping is applied and used as the new ordinal, if the mapping refers to
01049      an array it will be mapped recursively.
01050     */
01051     function mapOrdinals( $table, $ordinals )
01052     {
01053         $mappedOrdinals = array();
01054         foreach ( $ordinals as $ordinal )
01055         {
01056             while ( !is_array( $ordinal ) and isset( $table[$ordinal] ) )
01057             {
01058                 $ordinal = $table[$ordinal];
01059                 if ( is_array( $ordinal ) )
01060                 {
01061                     $ordinal = eZCodeMapper::mapOrdinals( $table, $ordinal );
01062                 }
01063             }
01064             if ( is_array( $ordinal ) )
01065                 $mappedOrdinals = array_merge( $mappedOrdinals, $ordinal );
01066             else
01067                 $mappedOrdinals[] = $ordinal;
01068         }
01069         return $mappedOrdinals;
01070     }
01071 
01072     /*!
01073      Goes trough all to codes in the mapping table \a $unicodeMap and maps
01074      those that match \a $fromCode into \a $toCode.
01075 
01076      \return \a $unicodeMap
01077     */
01078     protected function mapExistingCodes( $unicodeMap, $fromCode, $toCode )
01079     {
01080         foreach ( $unicodeMap as $from => $to )
01081         {
01082             if ( is_array( $to ) )
01083             {
01084                 $newTo = array();
01085                 foreach ( $to as $ordinal )
01086                 {
01087                     if ( $ordinal == $fromCode )
01088                     {
01089                         $newTo = array_merge( $newTo, array( $toCode ) );
01090                     }
01091                     else
01092                     {
01093                         $newTo[] = $ordinal;
01094                     }
01095                 }
01096                 $unicodeMap[$from] = $newTo;
01097             }
01098             else if ( $to == $fromCode )
01099             {
01100                 $unicodeMap[$from] = $toCode;
01101             }
01102         }
01103         return $unicodeMap;
01104     }
01105 
01106     /*!
01107      Goes through the mapping rules in the table \a $table and generates a simple
01108      mapping table which maps from one Unicode value to another (or array of values).
01109 
01110      The generation uses backward and forward propagation of the defined mappings
01111      to get the proper end result of a given value.
01112 
01113      \note This method can take a while if lots of rules are used
01114     */
01115     function generateSimpleMappingTable( $table, $allowedRanges )
01116     {
01117         if ( !is_array( $table ) )
01118             return false;
01119         $unicodeMap = array();
01120         foreach ( $table as $tableItem )
01121         {
01122             $type = $tableItem[0];
01123             $item = $tableItem[1];
01124             if ( isset( $tableItem[2] ) )
01125             {
01126                 $identifier = $tableItem[2];
01127 //                print( "identifier: $identifier\n" );
01128             }
01129             if ( $type == self::TYPE_DIRECT )
01130             {
01131                 foreach ( $item as $fromCode => $toCode )
01132                 {
01133 //                    print( "from: $fromCode, to: $toCode\n" );
01134 //                     if ( $fromCode == 1026 )
01135 //                     {
01136 //                         print( "<pre>oldcode<br/>" ); var_dump( $toCode ); print( "</pre>" );
01137 //                     }
01138                     $toCode = eZCodeMapper::ordinalValues( $unicodeMap, $toCode );
01139 //                     if ( $fromCode == 1026 )
01140 //                     {
01141 //                         print( "<pre>newcode<br/>" ); var_dump( $toCode ); print( "</pre>" );
01142 //                     }
01143                     if ( count( $allowedRanges ) == 0 )
01144                     {
01145                         if ( count( $toCode ) == 1 )
01146                             $toCode = $toCode[0];
01147                         // If the mapping already exists we skip it
01148                         if ( isset( $unicodeMap[$fromCode] ) )
01149                             continue;
01150 
01151                         $unicodeMap[$fromCode] = $toCode;
01152                         $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $fromCode, $toCode );
01153                     }
01154                     else
01155                     {
01156                         $allowed = false;
01157                         foreach ( $allowedRanges as $allowedRange )
01158                         {
01159                             if ( $fromCode >= $allowedRange[0] and
01160                                  $fromCode <= $allowedRange[1] )
01161                             {
01162                                 $allowed = true;
01163                                 break;
01164                             }
01165                         }
01166                         if ( !$allowed )
01167                             continue;
01168 
01169                         $toCodeList = $toCode;
01170                         $newToCodeList = array();
01171                         foreach ( $toCodeList as $toCode )
01172                         {
01173                             if ( is_bool( $toCode ) )
01174                             {
01175                                 $newToCodeList[] = $toCode;
01176                                 continue;
01177                             }
01178                             foreach ( $allowedRanges as $allowedRange )
01179                             {
01180                                 if ( $toCode >= $allowedRange[0] and
01181                                      $toCode <= $allowedRange[1] )
01182                                 {
01183                                     break;
01184                                 }
01185                             }
01186                             if ( $allowed )
01187                             {
01188                                 $newToCodeList[] = $toCode;
01189                             }
01190                         }
01191                         $toCode = $newToCodeList;
01192                         if ( count( $toCode ) > 0 )
01193                         {
01194                             if ( count( $toCode ) == 1 )
01195                                 $toCode = $toCode[0];
01196 
01197                             // If the mapping already exists we skip it
01198                             if ( isset( $unicodeMap[$fromCode] ) )
01199                                 continue;
01200 
01201                             $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $fromCode, $toCode );
01202 
01203                             $unicodeMap[$fromCode] = $toCode;
01204                         }
01205                     }
01206                 }
01207             }
01208             else if ( $type == self::TYPE_RANGE )
01209             {
01210                 foreach ( $item as $rangeItem )
01211                 {
01212                     $start = $rangeItem[0];
01213                     $stop = $rangeItem[1];
01214                     if ( $start > $stop )
01215                     {
01216                         $tmp = $stop;
01217                         $stop = $start;
01218                         $start = $tmp;
01219                     }
01220                     $add = $rangeItem[2];
01221                     $modulo = $rangeItem[3];
01222                     // Sanity-check, to avoid infinite loops
01223                     if ( $modulo == 0 )
01224                         $modulo = 1;
01225                     for ( $i = $start; $i <= $stop; $i += $modulo )
01226                     {
01227                         if ( count( $allowedRanges ) == 0 )
01228                         {
01229                             $allowed = true;
01230                         }
01231                         else
01232                         {
01233                             $allowed = false;
01234                             foreach ( $allowedRanges as $allowedRange )
01235                             {
01236                                 if ( $i >= $allowedRange[0] and
01237                                      $i <= $allowedRange[1] )
01238                                 {
01239                                     $allowed = true;
01240                                     break;
01241                                 }
01242                             }
01243                             if ( !$allowed )
01244                                 continue;
01245                         }
01246 
01247                         $replace = $i + $add;
01248                         $replace = eZCodeMapper::ordinalValues( $unicodeMap, $replace );
01249                         if ( count( $allowedRanges ) == 0 )
01250                         {
01251                             if ( count( $replace ) == 0 )
01252                                 $replace = false;
01253                             else if ( count( $replace ) == 1 )
01254                                 $replace = $replace[0];
01255                             $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $i, $replace );
01256 
01257                             // If the mapping already exists we skip it
01258                             if ( isset( $unicodeMap[$i] ) )
01259                                 continue;
01260 
01261                             $unicodeMap[$i] = $replace;
01262                         }
01263                         else
01264                         {
01265                             $newReplace = array();
01266                             foreach ( $allowedRanges as $allowedRange )
01267                             {
01268                                 foreach ( $replace as $replaceOrdinal )
01269                                 {
01270                                     if ( $replaceOrdinal >= $allowedRange[0] and
01271                                          $replaceOrdinal <= $allowedRange[1] )
01272                                     {
01273                                         $newReplace[] = $replaceOrdinal;
01274                                     }
01275                                 }
01276                             }
01277                             if ( count( $newReplace ) == 0 )
01278                                 $replace = false;
01279                             else if ( count( $newReplace ) == 1 )
01280                                 $replace = $newReplace[0];
01281                             else
01282                                 $replace = $newReplace;
01283 
01284                             // If the mapping already exists we skip it
01285                             if ( isset( $unicodeMap[$i] ) )
01286                                 continue;
01287 
01288                             $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $i, $replace );
01289                             $unicodeMap[$i] = $replace;
01290                         }
01291                     }
01292                 }
01293             }
01294             else if ( $type == self::TYPE_REPLACE )
01295             {
01296                 foreach ( $item as $rangeItem )
01297                 {
01298                     $start = $rangeItem[0];
01299                     $stop = $rangeItem[1];
01300                     if ( $start > $stop )
01301                     {
01302                         $tmp = $stop;
01303                         $stop = $start;
01304                         $start = $tmp;
01305                     }
01306                     $replace = $rangeItem[2];
01307                     $replace = eZCodeMapper::ordinalValues( $unicodeMap, $replace );
01308                     if ( count( $allowedRanges ) == 0 )
01309                     {
01310                         if ( count( $replace ) == 0 )
01311                             $replace = false;
01312                         else if ( count( $replace ) == 1 )
01313                             $replace = $replace[0];
01314                         for ( $i = $start; $i <= $stop; ++$i )
01315                         {
01316                             // If the mapping already exists we skip it
01317                             if ( isset( $unicodeMap[$i] ) )
01318                                 continue;
01319 
01320                             $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $i, $replace );
01321                             $unicodeMap[$i] = $replace;
01322                         }
01323                     }
01324                     else
01325                     {
01326                         $newReplace = array();
01327                         foreach ( $allowedRanges as $allowedRange )
01328                         {
01329                             foreach ( $replace as $replaceOrdinal )
01330                             {
01331                                 if ( $replaceOrdinal >= $allowedRange[0] and
01332                                      $replaceOrdinal <= $allowedRange[1] )
01333                                 {
01334                                     $newReplace[] = $replaceOrdinal;
01335                                 }
01336                             }
01337                         }
01338                         if ( count( $newReplace ) == 0 )
01339                             $replace = false;
01340                         else if ( count( $newReplace ) == 1 )
01341                             $replace = $newReplace[0];
01342                         else
01343                             $replace = $newReplace;
01344                         for ( $i = $start; $i <= $stop; ++$i )
01345                         {
01346                             $allowed = false;
01347                             foreach ( $allowedRanges as $allowedRange )
01348                             {
01349                                 if ( $i >= $allowedRange[0] and
01350                                      $i <= $allowedRange[1] )
01351                                 {
01352                                     $allowed = true;
01353                                     break;
01354                                 }
01355                             }
01356                             if ( $allowed )
01357                             {
01358                                 // If the mapping already exists we skip it
01359                                 if ( isset( $unicodeMap[$i] ) )
01360                                     continue;
01361 
01362                                 $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $i, $replace );
01363                                 $unicodeMap[$i] = $replace;
01364                             }
01365                         }
01366                     }
01367                 }
01368             }
01369         }
01370         return $unicodeMap;
01371     }
01372 
/*!
 Generates a unicode mapping table for the identifier(s) \a $identifier.

 The identifiers are first expanded with expandInheritance(), the result
 is turned into a flat table by generateSimpleMappingTable() and the
 table is finally sorted by key.

 \param $identifier Is either a single identifier string or
                    an array with identifiers.
 \return The unicode mapping table for all defined identifiers, sorted by key.
*/
function generateMappingCode( $identifier )
{
    // Accept a single identifier as well as a list of identifiers
    $identifierList = is_array( $identifier ) ? $identifier : array( $identifier );
    $inheritedTable = $this->expandInheritance( $identifierList );

    // No allowed ranges are given, so no character is filtered out for now
    $noRangeLimit = array();
    $unicodeMap = $this->generateSimpleMappingTable( $inheritedTable, $noRangeLimit );

    ksort( $unicodeMap );
    return $unicodeMap;
}
01392 
/*!
 Generates a mapping table for the character set \a $charset.
 This mapping table will only work for that character set but will be much faster
 and can be fed directly to the strtr() PHP function.

 \param $unicodeTable Mapping table where each key is converted as a single-element
                      array and each value is either one entry or an array of entries
                      (presumably unicode ordinals, verify against the codec).
 \param $charset Name of the character set the table is generated for.
 \return the table or \c false if the text codec could not be created.
*/
function generateCharsetMappingTable( $unicodeTable, $charset )
{
    $codec = eZTextCodec::instance( 'unicode', $charset );
    if ( !$codec )
    {
        eZDebug::writeError( "Failed to create textcodec for charset '$charset'" );
        return false;
    }

    $charsetTable = array();
    foreach ( $unicodeTable as $match => $replacement )
    {
        // A single replacement is wrapped so the codec always gets an array
        $replacementList = is_array( $replacement ) ? $replacement : array( $replacement );

        $localMatch = $codec->convertString( array( $match ) );
        $charsetTable[$localMatch] = $codec->convertString( $replacementList );
    }

    // Reverse key sort; the intention is that longer string entries end up
    // before shorter ones that share the same prefix, which matters for
    // variable length encodings such as utf8
    krsort( $charsetTable );
    return $charsetTable;
}
01429 
/*!
 Decodes a command into transformation rules.

 The rule names are looked up among the names returned by ruleNames():
 - The cleanup commands handled purely by code yield no rules.
 - Suffix commands (e.g. \c normalize) yield every rule ending in
   \c _<name> when no parameters are given, otherwise the rules named
   \c <parameter>_<name> which exist.
 - \c transform and \c transliterate yield every rule of the form
   \c <from><divider><to> when no parameters are given, or the one rule
   built from exactly two parameters if it exists.
 - Any other command must be registered in transform.ini under
   \c Extensions/Commands, if not an error is written to the debug log.

 \param $name Name of the command
 \param $parameters Array of parameters for the command
 \return An array with transformation rules.
*/
function decodeCommand( $name, $parameters )
{
    $knownRules = $this->ruleNames();
    $matchedRules = array();

    switch ( $name )
    {
        // Special code handlers, they never map to rules
        case 'url_cleanup_iri':
        case 'url_cleanup':
        case 'url_cleanup_compat':
        case 'identifier_cleanup':
            break;

        case 'normalize':
        case 'search_normalize':
        case 'decompose':
        case 'diacritical':
        case 'lowercase':
        case 'uppercase':
        case 'search_cleanup':
            if ( count( $parameters ) == 0 )
            {
                // Without parameters every rule with the command as suffix is used
                $suffixPattern = '#_'. $name . '$#';
                foreach ( $knownRules as $candidate )
                {
                    if ( preg_match( $suffixPattern, $candidate ) )
                    {
                        $matchedRules[] = $candidate;
                    }
                }
            }
            else
            {
                // Each parameter acts as prefix for the command name
                foreach ( $parameters as $prefix )
                {
                    $candidate = $prefix . '_' . $name;
                    if ( in_array( $candidate, $knownRules ) )
                    {
                        $matchedRules[] = $candidate;
                    }
                }
            }
            break;

        case 'transform':
        case 'transliterate':
            $dividers = array( 'transform' => '_to_',
                               'transliterate' => '_transliterate_' );
            $divider = $dividers[$name];
            if ( count( $parameters ) == 0 )
            {
                // Without parameters all rules going from one name to another are used
                $pairPattern = '#^[a-zA-Z][a-zA-Z0-9-]+'. $divider . '[a-zA-Z][a-zA-Z0-9-]+$#';
                foreach ( $knownRules as $candidate )
                {
                    if ( preg_match( $pairPattern, $candidate ) )
                    {
                        $matchedRules[] = $candidate;
                    }
                }
            }
            else if ( count( $parameters ) == 2 )
            {
                $candidate = $parameters[0] . $divider . $parameters[1];
                if ( in_array( $candidate, $knownRules ) )
                {
                    $matchedRules[] = $candidate;
                }
            }
            break;

        default:
            // Commands provided by extensions are valid but have no rules
            $ini = eZINI::instance( 'transform.ini' );
            $extensionCommands = $ini->variable( 'Extensions', 'Commands' );
            if ( !isset( $extensionCommands[$name] ) )
            {
                eZDebug::writeError( "Unknown command '$name'", __METHOD__ );
            }
            break;
    }

    return $matchedRules;
}
01514 
/*!
 Generates PHP code for the command \a $command.

 The generated code operates on a variable called \c $text and is meant to
 do the same work as executeCommandCode() does directly. Commands which are
 not built in are looked up in transform.ini under \c Extensions/Commands,
 the entry has the form \c path:className.

 \param $command Array describing the command, the name is read from the
                 \c 'command' entry.
 \param $charsetName The name of the charset the text will be in,
                     this can be used to generate different code for different charsets.
 \return A string containing PHP code or \c false if not supported.
*/
function generateCommandCode( $command, $charsetName )
{
    $commandName = $command['command'];

    if ( $commandName == 'url_cleanup_iri' )
    {
        $charsetNameTxt = var_export( $charsetName, true );
        return "\$text = eZCharTransform::commandUrlCleanupIRI( \$text, $charsetNameTxt );\n";
    }

    if ( $commandName == 'url_cleanup' )
    {
        $charsetNameTxt = var_export( $charsetName, true );
        return "\$text = eZCharTransform::commandUrlCleanup( \$text, $charsetNameTxt );\n";
    }

    if ( $commandName == 'url_cleanup_compat' )
    {
        $charsetNameTxt = var_export( $charsetName, true );
        return "\$text = eZCharTransform::commandUrlCleanupCompat( \$text, $charsetNameTxt );\n";
    }

    if ( $commandName == 'identifier_cleanup' )
    {
        return ( "\$text = strtolower( \$text );\n" .
                 "\$text = preg_replace( array( \"#[^a-z0-9_ ]#\",\n" .
                 "                             \"/ /\",\n" .
                 "                             \"/__+/\",\n" .
                 "                             \"/^_|_$/\" ),\n" .
                 "                      array( \" \",\n" .
                 "                             \"_\",\n" .
                 "                             \"_\",\n" .
                 "                             \"\" ),\n" .
                 "                      \$text );\n" );
    }

    if ( $commandName == 'search_cleanup' )
    {
        $code = '';

        // The character splitting code is only generated for charsets
        // which are not listed by nonCJKCharsets()
        $nonCJKCharsets = $this->nonCJKCharsets();
        if ( !in_array( $charsetName, $nonCJKCharsets ) )
        {
            $code .= ( '// add N-Gram(N=2)  chinese / japanese / korean multibyte characters' . "\n" .
                       '$codec = eZTextCodec::instance( false, \'unicode\' );' . "\n" .
                       "\n" .
                       '$unicodeValueArray = $codec->convertString( $text );' . "\n" .
                       "\n" .
                       '$normalizedTextArray = array();' . "\n" .
                       '$bFlag = false;' . "\n" .
                       'foreach ( array_keys( $unicodeValueArray ) as $valueKey )' . "\n" .
                       '{' . "\n" .
                       '    // Check for word characters that should be broken up for search' . "\n" .
                       '    if ( ( $unicodeValueArray[$valueKey] >= 12289 and' . "\n" .
                       '           $unicodeValueArray[$valueKey] <= 12542 ) or' . "\n" .
                       '         ( $unicodeValueArray[$valueKey] >= 13312 and' . "\n" .
                       '           $unicodeValueArray[$valueKey] <= 40863 ) or' . "\n" .
                       '         ( $unicodeValueArray[$valueKey] >= 44032 and' . "\n" .
                       '           $unicodeValueArray[$valueKey] <= 55203 ) )' . "\n" .
                       '    {' . "\n" .
                       '        if ( $bFlag )' . "\n" .
                       '        {' . "\n" .
                       '            $normalizedTextArray[] = $unicodeValueArray[$valueKey];' . "\n" .
                       '        }' . "\n" .
                       '        $normalizedTextArray[] = 32; // A space' . "\n" .
                       '        $normalizedTextArray[] = $unicodeValueArray[$valueKey];' . "\n" .
                       '        $bFlag = true;' . "\n" .
                       '    }' . "\n" .
                       '    else' . "\n" .
                       '    {' . "\n" .
                       '        if ( $bFlag )' . "\n" .
                       '        {' . "\n" .
                       '            $normalizedTextArray[] = 32; // A space' . "\n" .
                       '        }' . "\n" .
                       '        $normalizedTextArray[] = $unicodeValueArray[$valueKey];' . "\n" .
                       '        $bFlag = false;' . "\n" .
                       '    }' . "\n" .
                       '}' . "\n" .
                       'if ( $bFlag )' . "\n" .
                       '{' . "\n" .
                       '    $normalizedTextArray[count($normalizedTextArray)-1]=32;' . "\n" .
                       '}' . "\n" .
                       '$revCodec = eZTextCodec::instance( \'unicode\', false ); // false means use internal charset' . "\n" .
                       '$text = $revCodec->convertString( $normalizedTextArray );' . "\n" );
        }

        // The last pattern captures the character in front of a % which does
        // not follow a digit. Its replacement must be "$1 " so the captured
        // character is kept and only the % is turned into a space, this is
        // what executeCommandCode() does. A plain " " would also remove the
        // character in front of the %.
        $code .= ( '$text = preg_replace( array( "#(\.){2,}#",' . "\n" .
                   '                             "#^\.#",' . "\n" .
                   '                             "#\s\.#",' . "\n" .
                   '                             "#\.\s#",' . "\n" .
                   '                             "#\.$#",' . "\n" .
                   '                             "#([^0-9])%#" ),' . "\n" .
                   '                      array( " ",' . "\n" .
                   '                             " ",' . "\n" .
                   '                             " ",' . "\n" .
                   '                             " ",' . "\n" .
                   '                             " ",' . "\n" .
                   '                             "$1 " ),' . "\n" .
                   '                      $text );' . "\n" .
                   '$ini = eZINI::instance();' . "\n" .
                   'if ( $ini->variable( \'SearchSettings\', \'EnableWildcard\' ) != \'true\' )' . "\n" .
                   '{' . "\n" .
                   '    $text = str_replace( "*", " ", $text );' . "\n" .
                   '}' . "\n" .
                   '$charset = eZTextCodec::internalCharset();' . "\n" .
                   '$hasUTF8 = ( $charset == "utf-8" );' . "\n" .
                   "\n" .
                   'if ( $hasUTF8 )' . "\n" .
                   '{' . "\n" .
                   '    $text = preg_replace( "#(\s+)#u", " ", $text );' . "\n" .
                   '}' . "\n" .
                   'else' . "\n" .
                   '{' . "\n" .
                   '    $text = preg_replace( "#(\s+)#", " ", $text );' . "\n" .
                   '}' );

        return $code;
    }

    // Not a built in command, see if an extension has registered it
    $ini = eZINI::instance( 'transform.ini' );
    $commands = $ini->variable( 'Extensions', 'Commands' );
    if ( !isset( $commands[$commandName] ) )
    {
        return false;
    }

    list( $path, $className ) = explode( ':', $commands[$commandName], 2 );
    if ( !file_exists( $path ) )
    {
        eZDebug::writeError( "Could not locate include file '$path' for transformation '" . $commandName . "'" );
        return false;
    }

    // var_export() turns the values into valid PHP literals for the generated code
    $charsetNameTxt = var_export( $charsetName, true );
    $commandTxt     = var_export( $commandName, true );
    $pathTxt        = var_export( $path, true );
    return "include_once( $pathTxt );\n\$text = $className::executeCommand( \$text, $commandTxt, $charsetNameTxt );\n";
}
01658 
/*!
 Executes custom PHP code for the command \a $command.

 \param $text The text to work on, it is passed by reference and is
              replaced with the result of the command.
 \param $command Array describing the command, the name is read from the
                 \c 'command' entry.
 \param $charsetName The name of the charset the text will be in,
                     this can be used to execute different code for different charsets.
 \return \c true if the command is supported, \c false otherwise.
*/
function executeCommandCode( &$text, $command, $charsetName )
{
    $commandName = $command['command'];

    if ( $commandName == 'url_cleanup_iri' )
    {
        $text = eZCharTransform::commandUrlCleanupIRI( $text, $charsetName );
        return true;
    }

    if ( $commandName == 'url_cleanup' )
    {
        $text = eZCharTransform::commandUrlCleanup( $text, $charsetName );
        return true;
    }

    if ( $commandName == 'url_cleanup_compat' )
    {
        $text = eZCharTransform::commandUrlCleanupCompat( $text, $charsetName );
        return true;
    }

    if ( $commandName == 'identifier_cleanup' )
    {
        // Lowercase, turn everything outside a-z, 0-9, _ and space into a space,
        // turn spaces into underscores, collapse repeated underscores
        // and strip an underscore at the start or the end
        $patterns = array( "#[^a-z0-9_ ]#",
                           "/ /",
                           "/__+/",
                           "/^_|_$/" );
        $replacements = array( " ",
                               "_",
                               "_",
                               "" );
        $text = preg_replace( $patterns, $replacements, strtolower( $text ) );
        return true;
    }

    if ( $commandName == 'search_cleanup' )
    {
        if ( !in_array( $charsetName, $this->nonCJKCharsets() ) )
        {
            // Break up characters in the ranges below (presumably the
            // chinese / japanese / korean characters) so that a run of them
            // is turned into space separated overlapping pairs
            $toUnicode = eZTextCodec::instance( false, 'unicode' );
            $ordinals = $toUnicode->convertString( $text );

            $output = array();
            $previousWasSplit = false;
            foreach ( array_keys( $ordinals ) as $position )
            {
                $ordinal = $ordinals[$position];

                // Check for word characters that should be broken up for search
                $isSplit = ( ( $ordinal >= 12289 && $ordinal <= 12542 ) ||
                             ( $ordinal >= 13312 && $ordinal <= 40863 ) ||
                             ( $ordinal >= 44032 && $ordinal <= 55203 ) );

                if ( $previousWasSplit )
                {
                    // Either complete the pair started by the previous
                    // character or close the run with a space
                    $output[] = $isSplit ? $ordinal : 32;
                }
                if ( $isSplit )
                {
                    $output[] = 32; // A space
                }
                $output[] = $ordinal;
                $previousWasSplit = $isSplit;
            }

            if ( $previousWasSplit )
            {
                // The text ended inside a run, the last entry becomes a space
                $output[ count( $output ) - 1 ] = 32;
            }

            $fromUnicode = eZTextCodec::instance( 'unicode', false ); // false means use internal charset
            $text = $fromUnicode->convertString( $output );
        }

        // Make sure dots inside words/numbers are kept, the rest is turned into space
        $patterns = array( "#(\.){2,}#",
                           "#^\.#",
                           "#\s\.#",
                           "#\.\s#",
                           "#\.$#",
                           "#([^0-9])%#" ); // Keep only % after a number
        $replacements = array( " ",
                               " ",
                               " ",
                               " ",
                               " ",
                               "$1 " );
        $text = preg_replace( $patterns, $replacements, $text );

        // Wildcards are only kept when they are enabled for search
        $ini = eZINI::instance();
        if ( $ini->variable( 'SearchSettings', 'EnableWildcard' ) != 'true' )
        {
            $text = str_replace( "*", " ", $text );
        }

        // Collapse whitespace, the u modifier is only used when the internal charset is utf-8
        $whitespacePattern = ( eZTextCodec::internalCharset() == "utf-8" ) ? "#(\s+)#u" : "#(\s+)#";
        $text = preg_replace( $whitespacePattern, " ", $text );

        return true;
    }

    // Not a built in command, see if an extension has registered it
    $ini = eZINI::instance( 'transform.ini' );
    $commands = $ini->variable( 'Extensions', 'Commands' );
    if ( !isset( $commands[$commandName] ) )
    {
        return false;
    }

    list( $path, $className ) = explode( ':', $commands[$commandName], 2 );
    if ( !file_exists( $path ) )
    {
        eZDebug::writeError( "Could not locate include file '$path' for transformation '" . $commandName . "'" );
        return false;
    }

    include_once( $path );
    $text = call_user_func_array( array( $className, 'executeCommand' ),
                                  array( $text, $commandName, $charsetName ) );
    return true;
}
01801 
/*!
 \return An array with the names of charsets that are certain to not contain CJK characters.
         The list is grouped by charset family and always comes in the same order.
*/
function nonCJKCharsets()
{
    $codePages = array( 'cp437', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855', 'cp857',
                        'cp860', 'cp861', 'cp862', 'cp863', 'cp864', 'cp865', 'cp866',
                        'cp869', 'cp874' );

    $isoCharsets = array( 'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5',
                          'iso-8859-6', 'iso-8859-7', 'iso-8859-8', 'iso-8859-9', 'iso-8859-10',
                          'iso-8859-11', 'iso-8859-13', 'iso-8859-14', 'iso-8859-15' );

    $windowsCharsets = array( 'windows-1250', 'windows-1251', 'windows-1252', 'windows-1253',
                              'windows-1254', 'windows-1255', 'windows-1256', 'windows-1257',
                              'windows-1258' );

    // array_merge() renumbers the entries, the result is one continuous list
    return array_merge( array( 'adobe-standard-encoding' ),
                        $codePages,
                        array( 'dec-mcs', 'hp-roman8' ),
                        $isoCharsets,
                        array( 'koi8-r', 'koi8-u', 'macintosh', 'next', 'us-ascii' ),
                        $windowsCharsets );
}
01820 
01821     /// \privatesection
01822     public $TransformationTables;
01823     public $TransformationFiles;
01824     public $ISOUnicodeCodec;
01825 }
01826 
01827 ?>