eZ Publish  [4.0]
ezcodemapper.php
Go to the documentation of this file.
00001 <?php
00002 //
00003 // Definition of eZCodeMapper class
00004 //
00005 // Created on: <18-Jun-2004 14:56:15 amos>
00006 //
00007 // ## BEGIN COPYRIGHT, LICENSE AND WARRANTY NOTICE ##
00008 // SOFTWARE NAME: eZ Publish
00009 // SOFTWARE RELEASE: 4.0.x
00010 // COPYRIGHT NOTICE: Copyright (C) 1999-2008 eZ Systems AS
00011 // SOFTWARE LICENSE: GNU General Public License v2.0
00012 // NOTICE: >
00013 //   This program is free software; you can redistribute it and/or
00014 //   modify it under the terms of version 2.0  of the GNU General
00015 //   Public License as published by the Free Software Foundation.
00016 //
00017 //   This program is distributed in the hope that it will be useful,
00018 //   but WITHOUT ANY WARRANTY; without even the implied warranty of
00019 //   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00020 //   GNU General Public License for more details.
00021 //
00022 //   You should have received a copy of version 2.0 of the GNU General
00023 //   Public License along with this program; if not, write to the Free
00024 //   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00025 //   MA 02110-1301, USA.
00026 //
00027 //
00028 // ## END COPYRIGHT, LICENSE AND WARRANTY NOTICE ##
00029 //
00030 
00031 /*! \file ezcodemapper.php
00032 */
00033 
00034 /*!
00035   \class eZCodeMapper ezcodemapper.php
00036   \ingroup eZI18N
00037   \brief Handles mapping of character codes
00038 
00039 */
00040 
00041 class eZCodeMapper
00042 {
00043     const TYPE_DIRECT = 1;
00044     const TYPE_RANGE = 2;
00045     const TYPE_REPLACE = 3;
00046 
00047     /*!
00048      Constructor
00049     */
00050     function eZCodeMapper()
00051     {
00052         $this->TransformationTables = array();
00053         $this->TransformationFiles = array();
00054     }
00055 
00056     /*!
00057      \return The mapping table for identifier \a $identifier or \c false if it is not found.
00058     */
00059     function mappingTable( $identifier )
00060     {
00061         if ( isset( $this->TransformationTables[$identifier] ) )
00062             return $this->TransformationTables[$identifier];
00063         return false;
00064     }
00065 
00066     /*!
00067      \return An array with the names of rules which are currently available.
00068     */
00069     function ruleNames()
00070     {
00071         return array_keys( $this->TransformationTables );
00072     }
00073 
00074     /*!
00075      Outputs error \a $text found in parsed file at position \a $position.
00076     */
00077     function error( $text, $position = false )
00078     {
00079         if ( $position )
00080         {
00081             $str = $position['file'] . ':' . $position['from'][0] . ' C' . $position['from'][1];
00082             if ( isset( $position['to'] ) )
00083                 $str .= ' -> L' . $position['to'][0] . ' C' . $position['to'][1];
00084             $str .= ':';
00085         }
00086         $str .= $text;
00087         if ( class_exists( 'ezcli' ) )
00088         {
00089             //include_once( 'lib/ezutils/classes/ezcli.php' );
00090             $cli = eZCLI::instance();
00091             $cli->error( $str );
00092         }
00093         else
00094         {
00095             eZDebug::writeError( $str, 'eZCodeMapper::error' );
00096         }
00097     }
00098 
00099     /*!
00100      Outputs warning \a $text found in parsed file at position \a $position.
00101     */
00102     function warning( $text, $position = false )
00103     {
00104         if ( $position )
00105         {
00106             $str = $position['file'] . ':' . $position['from'][0] . ' C' . $position['from'][1];
00107             if ( isset( $position['to'] ) )
00108                 $str .= ' -> L' . $position['to'][0] . ' C' . $position['to'][1];
00109             $str .= ':';
00110         }
00111         $str .= $text;
00112         if ( class_exists( 'ezcli' ) )
00113         {
00114             //include_once( 'lib/ezutils/classes/ezcli.php' );
00115             $cli = eZCLI::instance();
00116             $cli->warning( $str );
00117         }
00118         else
00119         {
00120             eZDebug::writeWarning( $str, 'eZCodeMapper::warning' );
00121         }
00122     }
00123 
00124     /*!
00125      \return \c true if the transformation file is already loaded.
00126     */
00127     function isTranformationLoaded( $name )
00128     {
00129         return in_array( $name, $this->TransformationFiles );
00130     }
00131 
00132     /*!
00133      Loads all transformation files defined in \c transform.ini to the current
00134      mapper. It will also load any transformations found in extensions.
00135 
00136      \param $currentCharset The name of the current charset in use. The caller must
00137                             make sure this is not an alias by using eZCharsetInfo::realCharsetCode()
00138      \param $transformationGroup The transformation group which is currently used or \c false for none.
00139     */
00140     function loadTransformationFiles( $currentCharset, $transformationGroup )
00141     {
00142         $ini = eZINI::instance( 'transform.ini' );
00143         $repositoryList = array( $ini->variable( 'Transformation', 'Repository' ) );
00144         $files = $ini->variable( 'Transformation', 'Files' );
00145         //include_once( 'lib/ezutils/classes/ezextension.php' );
00146         $extensions = $ini->variable( 'Transformation', 'Extensions' );
00147         $repositoryList = array_merge( $repositoryList,
00148                                        eZExtension::expandedPathList( $extensions, 'transformations' ) );
00149 
00150         // Check if the current charset maps to a unicode group
00151         // If it does it can trigger loading of additional files
00152         $unicodeGroups = array();
00153         $charsets = $ini->variable( 'Transformation', 'Charsets' );
00154         foreach ( $charsets as $entry )
00155         {
00156             list ( $charset, $group ) = explode( ';', $entry, 2 );
00157             $charset = eZCharsetInfo::realCharsetCode( $charset );
00158             if ( $charset == $currentCharset )
00159             {
00160                 if ( !in_array( $group, $unicodeGroups ) )
00161                     $unicodeGroups[] = $group;
00162             }
00163         }
00164 
00165         // If we are using transformation groups then add that as
00166         // a unicode group. This causes it load transformation files
00167         // specific to that group.
00168         if ( $transformationGroup !== false )
00169             $unicodeGroups[] = $transformationGroup;
00170 
00171         // Add any extra files from the unicode groups
00172         foreach ( $unicodeGroups as $unicodeGroup )
00173         {
00174             if ( $ini->hasGroup( $unicodeGroup ) )
00175             {
00176                 $files = array_merge( $files, $ini->variable( $unicodeGroup, 'Files' ) );
00177                 $extensions = $ini->variable( $unicodeGroup, 'Extensions' );
00178                 $repositoryList = array_merge( $repositoryList,
00179                                                eZExtension::expandedPathList( $extensions, 'transformations' ) );
00180             }
00181         }
00182 
00183         foreach ( $files as $file )
00184         {
00185             // Only load files that are not currently loaded
00186             if ( $this->isTranformationLoaded( $file ) )
00187                 continue;
00188 
00189             foreach ( $repositoryList as $repository )
00190             {
00191                 $trFile = $repository . '/' . $file;
00192                 if ( file_exists( $trFile ) )
00193                 {
00194                     $this->parseTransformationFile( $trFile, $file );
00195                 }
00196             }
00197         }
00198     }
00199 
00200     /*!
00201      Parses the transformation file \a $filename and appends any rules it finds
00202      to the current rule list.
00203      \param $name The name of transformation file as it was requested, ie. without a path
00204     */
00205     function parseTransformationFile( $filename, $name )
00206     {
00207 //         eZDebug::writeDebug( "Parsing file $filename" );
00208         $tbl = array();
00209 
00210         $fd = fopen( $filename, "rb" );
00211         if ( !$fd )
00212         {
00213             $this->error( "Failed opening $filename" );
00214             return false;
00215         }
00216 
00217         $this->TransformationFiles[] = $name;
00218 
00219         //include_once( 'lib/ezi18n/classes/eztextcodec.php' );
00220         //include_once( 'lib/ezi18n/classes/ezcharsetinfo.php' );
00221         $this->ISOUnicodeCodec = eZTextCodec::instance( 'iso-8859-1', 'unicode' );
00222 
00223         $buffer = '';
00224         $lineNum = 1;
00225         $i = 0;
00226         $hexValues = "0123456789abcdefABCDEF";
00227         $identifier = false;
00228 
00229         // The big funky parser starts here
00230         // It starts by reading a chunk of data from the file
00231         // then splits everything into an array with lines.
00232         // Then it traverses one line at a time looking for
00233         // identifiers and rules. Comments will be removed before the
00234         // line is parsed for identifiers and rules.
00235 
00236         while ( !feof( $fd ) or strlen( $buffer ) > 0 )
00237         {
00238             $lines = array();
00239             $len = strlen( $buffer );
00240             // Check if we have data in the buffer yet
00241             // Note: The actual buffer reading is done at the end of this while loop
00242             if ( $len > 0 )
00243             {
00244                 $endPos = false;
00245                 $eolPos = 0;
00246                 // Look for complete lines and append to $lines
00247                 while ( $eolPos !== false and $eolPos < $len )
00248                 {
00249                     $eolPos = strpos( $buffer, "\n", $endPos );
00250                     if ( $eolPos !== false )
00251                     {
00252                         $line = substr( $buffer, $endPos, $eolPos - $endPos );
00253                         $lines[] = array( 'text' => $line,
00254                                           'line' => $lineNum );
00255                         ++$lineNum;
00256                         $endPos = $eolPos + 1;
00257                     }
00258                 }
00259 
00260                 // If we have leftover data place that back in $buffer
00261                 if ( $endPos !== false )
00262                 {
00263                     $buffer = substr( $buffer, $endPos );
00264                 }
00265             }
00266 
00267             // Once we have some lines start parsing them one at a time
00268             foreach ( $lines as $lineData )
00269             {
00270                 $line = $lineData['text'];
00271                 $lineOrg = $line;
00272                 $linePos = $lineData['line'];
00273                 $commentPos = strpos( $line, '#' );
00274                 $origLine = $line;
00275                 // Get rid of any comments before we check the line
00276                 if ( $commentPos !== false )
00277                 {
00278                     $line = substr( $line, 0, $commentPos );
00279                 }
00280                 $trimLine = trim( $line );
00281                 // Skip empty lines
00282                 if ( strlen( $trimLine ) == 0 )
00283                     continue;
00284 
00285 //                 print( "Line: '$line'\n" );
00286 
00287                 $unicodeData = false;
00288 
00289                 $sourceValue = false;
00290                 $sourceEndValue = false;
00291                 $destinationValues = false;
00292                 $transposeValue = false;
00293                 $transposeAdd = true;
00294                 $moduloValue = 1;
00295                 // source, marker, range_input, range_marker, map_input, transpose_input, replace_input
00296                 $state = 'source';
00297                 // map, transpose, replace
00298                 $type = false;
00299 
00300                 $len = strlen( $line );
00301                 if ( preg_match( '#^(.+):[ \t]*$#', $line, $matches ) )
00302                 {
00303                     $identifier = $matches[1];
00304                     if ( !preg_match( '#^[a-zA-Z_-][a-zA-Z0-9_-]*$#', $identifier ) )
00305                     {
00306                         $this->warning( "Invalid identifier '$identifier', can only contain a-z, a-Z - and _",
00307                                       array( 'file' => $filename, 'from' => array( $linePos, strlen( $identifier ) ) ) );
00308                         $identifier = false;
00309                         continue;
00310                     }
00311 //                     print( "identifier '$identifier'\n" );
00312                     continue;
00313                 }
00314                 else if ( $identifier === false )
00315                 {
00316                     $this->warning( "No identifier defined yet, skipping: '" . $line . "'",
00317                                     array( 'file' => $filename, 'from' => array( $linePos, 0 ) ) );
00318                     continue;
00319                 }
00320                 else
00321                 {
00322                     $pos = 0;
00323                     $col = 0;
00324                     $failed = false;
00325                     while ( $pos < $len )
00326                     {
00327                         while ( $pos < $len and
00328                                 ( $line[$pos] == ' ' or
00329                                   $line[$pos] == "\t" ) )
00330                         {
00331                             ++$pos;
00332                         }
00333                         if ( $pos >= $len )
00334                             break;
00335 
00336                         $char = $line[$pos];
00337                         $unicodeData = false;
00338                         if ( $char == '"' )
00339                         {
00340                             $delimiterPos = $pos;
00341                             while ( $delimiterPos < $len )
00342                             {
00343                                 $delimiterPos = strpos( $line, '"', $delimiterPos + 1 );
00344                                 if ( $delimiterPos === false or
00345                                      $delimiterPos <= $pos + 1 or
00346                                      $line[$delimiterPos - 1] != "\\" )
00347                                     break;
00348                             }
00349                             if ( $delimiterPos === false )
00350                             {
00351                                 $this->warning( "No end-quote found for line, skipping: '$line'",
00352                                                 array( 'file' => $filename,
00353                                                        'from' => array( $linePos, $pos ),
00354                                                        'to' => array( $linePos, strlen( $line ) ) ) );
00355                                 $pos = $len;
00356                                 $failed = true;
00357                                 break;
00358                             }
00359                             $str = str_replace( array( "\\\"", "\\\\" ),
00360                                                 array( "\"", "\\" ),
00361                                                 substr( $line, $pos + 1, $delimiterPos - $pos - 1 ) );
00362 //                             print( "string '$str'\n" );
00363                             $pos = $delimiterPos + 1;
00364                             $unicodeData = array( 'value' => $str,
00365                                                   'type' => 'string' );
00366                         }
00367                         else if ( $char == 'U' and
00368                              $pos + 1 < $len and
00369                              $line[$pos + 1] == '+' )
00370                         {
00371                             $hexPos = $pos + 2;
00372                             if ( $hexPos + 4 > $len )
00373                             {
00374                                 $col = $hexPos;
00375                                 $this->warning( "Found U+ value with " . ( 4 - ( $len - $hexPos ) ) . " missing hex numbers",
00376                                                 array( 'file' => $filename,
00377                                                        'from' => array( $linePos, $hexPos ) ) );
00378                                 $failed = true;
00379                                 $pos = $hexPos;
00380                                 break;
00381                             }
00382                             $hasHexValues = true;
00383                             for ( $offset = 0; $offset < 4; ++$offset )
00384                             {
00385                                 $hexChar = $line[$hexPos + $offset];
00386                                 if ( $hexChar == ' ' or
00387                                      $hexChar == "\t" )
00388                                 {
00389                                     $col = $hexPos + $offset;
00390                                     $hasHexValues = false;
00391                                     $this->warning( "Found U+ value with " . ( 4 - $offset ) . " missing hex numbers",
00392                                                     array( 'file' => $filename,
00393                                                            'from' => array( $linePos, $hexPos ),
00394                                                            'to' => array( $linePos, $hexPos + $offset ) ) );
00395                                     $failed = true;
00396                                     $pos = $hexPos + $offset;
00397                                     break;
00398                                 }
00399                                 if ( strpos( $hexValues, $hexChar ) === false )
00400                                 {
00401                                     $col = $hexPos + $offset;
00402                                     $hasHexValues = false;
00403                                     $this->warning( "Found U+ value with invalid hex numbers ($hexChar)",
00404                                                     array( 'file' => $filename,
00405                                                            'from' => array( $linePos, $hexPos ),
00406                                                            'to' => array( $linePos, $hexPos + $offset ) ) );
00407                                     $pos = $hexPos + $offset;
00408                                     $failed = true;
00409                                     break;
00410                                 }
00411                             }
00412                             if ( $failed )
00413                                 break;
00414                             if ( $hasHexValues )
00415                             {
00416                                 $unicodeValue = hexdec( substr( $line, $hexPos, 4 ) );
00417                                 $unicodeData = array( 'value' => $unicodeValue,
00418                                                       'type' => 'unicode' );
00419 //                                 print( "unicode U+ '$unicodeValue'\n" );
00420                             }
00421                             $pos = $hexPos + 4;
00422                         }
00423                         else if ( strpos( $hexValues, $char ) !== false and
00424                                   $pos + 1 < $len and
00425                                   strpos( $hexValues, $line[$pos + 1] ) !== false )
00426                         {
00427                             $hexPos = $pos;
00428                             if ( $hexPos + 2 > $len )
00429                             {
00430                                 $col = $len;
00431                                 $this->warning( "Found ASCII value with " . ( 2 - ( $len - $hexPos ) ) . " missing hex numbers",
00432                                                 array( 'file' => $filename,
00433                                                        'from' => array( $linePos, $hexPos ) ) );
00434                                 $pos = $hexPos;
00435                                 $failed = true;
00436                                 break;
00437                             }
00438                             $hasHexValues = true;
00439                             for ( $offset = 0; $offset < 2; ++$offset )
00440                             {
00441                                 $hexChar = $line[$hexPos + $offset];
00442                                 if ( $hexChar == ' ' or
00443                                      $hexChar == "\t" )
00444                                 {
00445                                     $col = $hexPos + $offset;
00446                                     $hasHexValues = false;
00447                                     $this->warning( "Found ASCII value with " . ( 2 - $offset ) . " missing hex numbers",
00448                                                     array( 'file' => $filename,
00449                                                            'from' => array( $linePos, $hexPos ),
00450                                                            'to' => array( $linePos, $hexPos + $offset ) ) );
00451                                     $pos = $hexPos + $offset;
00452                                     $failed = true;
00453                                     break;
00454                                 }
00455                                 if ( strpos( $hexValues, $hexChar ) === false )
00456                                 {
00457                                     $col = $hexPos + $offset;
00458                                     $hasHexValues = false;
00459                                     $this->warning( "Found ASCII value with invalid hex numbers ($hexChar)",
00460                                                     array( 'file' => $filename,
00461                                                            'from' => array( $linePos, $hexPos ),
00462                                                            'to' => array( $linePos, $hexPos + $offset ) ) );
00463                                     $pos = $hexPos + $offset;
00464                                     $failed = true;
00465                                     break;
00466                                 }
00467                             }
00468                             if ( $failed )
00469                                 break;
00470                             if ( $hasHexValues )
00471                             {
00472                                 $asciiValue = hexdec( substr( $line, $hexPos, 4 ) );
00473 //                                 print( "unicode ASCII '$asciiValue'\n" );
00474                                 $unicodeData = array( 'value' => $asciiValue,
00475                                                       'type' => 'ascii' );
00476                             }
00477                             $pos = $hexPos + 2;
00478                         }
00479                         else if ( substr( $line, $pos, 6 ) == 'remove' )
00480                         {
00481 //                             print( "remove character\n" );
00482                             $unicodeData = array( 'value' => false,
00483                                                   'type' => 'remove' );
00484                             $pos += 6;
00485                         }
00486                         else if ( substr( $line, $pos, 4 ) == 'keep' )
00487                         {
00488 //                             print( "keep character\n" );
00489                             $unicodeData = array( 'value' => true,
00490                                                   'type' => 'keep' );
00491                             $pos += 4;
00492                         }
00493 
00494                         if ( $unicodeData )
00495                         {
00496 //                             print( "data state: $state\n" );
00497                             // source, marker, range_input, range_marker, map_input, transpose_input, replace_input, transpose_modulo
00498                             if ( $state == 'source' )
00499                             {
00500                                 if ( $unicodeData['type'] == 'string' and
00501                                      strlen( $unicodeData['value'] ) > 1 )
00502                                 {
00503                                     $this->warning( "Text string with more than one character cannot be used as input value '" . $unicodeData['value'] . "'",
00504                                                     array( 'file' => $filename,
00505                                                            'from' => array( $linePos, $pos ) ) );
00506                                     $failed = true;
00507                                     break;
00508                                 }
00509                                 $sourceValue = $this->extractUnicodeValue( $unicodeData );
00510                                 $state = 'marker';
00511                             }
00512                             else if ( $state == 'marker' )
00513                             {
00514                                 $this->warning( "Source value not expected, a source value has already been extracted at $line" . "[$pos]",
00515                                                 array( 'file' => $filename,
00516                                                        'from' => array( $linePos, $pos ) ) );
00517                                 $failed = true;
00518                                 break;
00519                             }
00520                             else if ( $state == 'range_input' )
00521                             {
00522                                 if ( $unicodeData['type'] == 'string' and
00523                                      strlen( $unicodeData['value'] ) > 1 )
00524                                 {
00525                                     $this->warning( "Text string with more than one character cannot be used as range end value '" . $unicodeData['value'] . "'",
00526                                                     array( 'file' => $filename,
00527                                                            'from' => array( $linePos, $pos ) ) );
00528                                     $failed = true;
00529                                     break;
00530                                 }
00531                                 $sourceEndValue = $this->extractUnicodeValue( $unicodeData );
00532                                 $state = 'range_marker_or_modulo';
00533                             }
00534                             else if ( $state == 'range_marker_or_modulo' or
00535                                       $state == 'range_marker' )
00536                             {
00537                                 $this->warning( "Range value not expected, a range value has already been extracted at $line" . "[$pos]",
00538                                                 array( 'file' => $filename,
00539                                                        'from' => array( $linePos, $pos ) ) );
00540                                 $failed = true;
00541                                 break;
00542                             }
00543                             else if ( $state == 'map_input' )
00544                             {
00545                                 if ( !is_array( $destinationValues ) )
00546                                     $destinationValues = array();
00547                                 $destinationValues = array_merge( $destinationValues,
00548                                                                   $this->extractUnicodeValues( $unicodeData ) );
00549                                 $type = 'map';
00550                             }
00551                             else if ( $state == 'replace_input' )
00552                             {
00553                                 if ( !is_array( $destinationValues ) )
00554                                     $destinationValues = array();
00555                                 $destinationValues = array_merge( $destinationValues,
00556                                                                   $this->extractUnicodeValues( $unicodeData ) );
00557                                 $type = 'replace';
00558                             }
00559                             else if ( $state == 'transpose_input' )
00560                             {
00561                                 if ( $unicodeData['type'] == 'string' and
00562                                      strlen( $unicodeData['value'] ) > 1 )
00563                                 {
00564                                     $this->warning( "Text string with more than one character cannot be used as transpose value '" . $unicodeData['value'] . "'",
00565                                                     array( 'file' => $filename,
00566                                                            'from' => array( $linePos, $pos ) ) );
00567                                     $failed = true;
00568                                     break;
00569                                 }
00570                                 $transposeValue = $this->extractUnicodeValue( $unicodeData );
00571                                 $type = 'transpose';
00572                             }
00573                             else if ( $state == 'transpose_modulo' )
00574                             {
00575                                 if ( $unicodeData['type'] == 'string' and
00576                                      strlen( $unicodeData['value'] ) > 1 )
00577                                 {
00578                                     $this->warning( "Text string with more than one character cannot be used as transpose modulo value '" . $unicodeData['value'] . "'",
00579                                                     array( 'file' => $filename,
00580                                                            'from' => array( $linePos, $pos ) ) );
00581                                     $failed = true;
00582                                     break;
00583                                 }
00584                                 $moduloValue = $this->extractUnicodeValue( $unicodeData );
00585                                 if ( $moduloValue == 0 )
00586                                 {
00587                                     $this->error( "Modulo value of 0 is not allowed, 1 will be used instead",
00588                                                   array( 'file' => $filename,
00589                                                          'from' => array( $linePos, $pos ) ) );
00590                                     // Note: There is another 0 check in generateSimpleMappingTable()
00591                                 }
00592 //                                 print( "modulo value=$moduloValue\n" );
00593                                 $state = 'range_marker';
00594                             }
00595                         }
00596                         else if ( !$failed )
00597                         {
00598 //                             print( "command state: $state\n" );
00599                             // source, marker, range_input, range_marker, map_input, transpose_input, replace_input
00600                             if ( $state == 'source' )
00601                             {
00602                                 if ( $char == '=' )
00603                                 {
00604                                     $this->warning( "Cannot use map marker $char without prior character value",
00605                                                     array( 'file' => $filename,
00606                                                            'from' => array( $linePos, $pos ) ) );
00607                                     $failed = true;
00608                                     break;
00609                                 }
00610                                 else if ( $char == '+' or
00611                                           $char == '-' )
00612                                 {
00613                                     $this->warning( "Cannot use range marker $char without prior character value",
00614                                                     array( 'file' => $filename,
00615                                                            'from' => array( $linePos, $pos ) ) );
00616                                     $failed = true;
00617                                     break;
00618                                 }
00619                                 else
00620                                 {
00621                                     $this->warning( "Unknown character '$char', expecting input value",
00622                                                     array( 'file' => $filename,
00623                                                            'from' => array( $linePos, $pos ) ) );
00624                                     $failed = true;
00625                                     break;
00626                                 }
00627                             }
00628                             else if ( $state == 'marker' )
00629                             {
00630                                 if ( $char == '=' )
00631                                 {
00632                                     $state = 'map_input';
00633                                     ++$pos;
00634                                 }
00635                                 else if ( $char == '-' )
00636                                 {
00637                                     $state = 'range_input';
00638                                     ++$pos;
00639                                 }
00640                                 else if ( $char == '+' )
00641                                 {
00642                                     $this->warning( "Cannot use range marker $char without prior character value",
00643                                                     array( 'file' => $filename,
00644                                                            'from' => array( $linePos, $pos ) ) );
00645                                     $failed = true;
00646                                     break;
00647                                 }
00648                                 else
00649                                 {
00650                                     $this->warning( "Unknown character '$char', expecting marker",
00651                                                     array( 'file' => $filename,
00652                                                            'from' => array( $linePos, $pos ) ) );
00653                                     $failed = true;
00654                                     break;
00655                                 }
00656                             }
00657                             else if ( $state == 'range_marker_or_modulo' or
00658                                       $state == 'range_marker' )
00659                             {
00660                                 if ( $state == 'range_marker_or_modulo' and
00661                                      $char == '%' )
00662                                 {
00663 //                                     print( "found modulo marker\n" );
00664                                     // Look for modulo value
00665                                     $state = 'transpose_modulo';
00666                                     ++$pos;
00667                                 }
00668                                 else if ( $char == '=' )
00669                                 {
00670                                     $state = 'replace_input';
00671                                     ++$pos;
00672                                 }
00673                                 else if ( $char == '-' or
00674                                           $char == '+' )
00675                                 {
00676                                     $transposeAdd = ( $char == '+' ? true : false );
00677                                     $state = 'transpose_input';
00678                                     ++$pos;
00679                                 }
00680                                 else
00681                                 {
00682                                     $this->warning( "Unknown character '$char', expecting range end value",
00683                                                     array( 'file' => $filename,
00684                                                            'from' => array( $linePos, $pos ) ) );
00685                                     $failed = true;
00686                                     break;
00687                                 }
00688                             }
00689                             else if ( $state == 'map_input' )
00690                             {
00691                                 if ( $char == '=' )
00692                                 {
00693                                     $this->warning( "Duplicate mapping marker $char",
00694                                                     array( 'file' => $filename,
00695                                                            'from' => array( $linePos, $pos ) ) );
00696                                     $failed = true;
00697                                     break;
00698                                 }
00699                                 else if ( $char == '-' or
00700                                           $char == '+' )
00701                                 {
00702                                     $this->warning( "Already mapping values, cannot use range/transpose marker $char",
00703                                                     array( 'file' => $filename,
00704                                                            'from' => array( $linePos, $pos ) ) );
00705                                     $failed = true;
00706                                     break;
00707                                 }
00708                                 else
00709                                 {
00710                                     $this->warning( "Unknown character '$char', expecting output values",
00711                                                     array( 'file' => $filename,
00712                                                            'from' => array( $linePos, $pos ) ) );
00713                                     $failed = true;
00714                                     break;
00715                                 }
00716                             }
00717                             else if ( $state == 'transpose_modulo' )
00718                             {
00719                                 if ( $char == '%' )
00720                                 {
00721                                     $this->warning( "Modulo marker already used, cannot use $char",
00722                                                     array( 'file' => $filename,
00723                                                            'from' => array( $linePos, $pos ) ) );
00724                                     $failed = true;
00725                                     break;
00726                                 }
00727                                 else if ( $char == '-' or
00728                                           $char == '+' )
00729                                 {
00730                                     $this->warning( "Transpose marker $char used, but no modulo value has been found yet",
00731                                                     array( 'file' => $filename,
00732                                                            'from' => array( $linePos, $pos ) ) );
00733                                     $failed = true;
00734                                     break;
00735                                 }
00736                                 else
00737                                 {
00738                                     $this->warning( "Unknown character '$char', expecting modulo value",
00739                                                     array( 'file' => $filename,
00740                                                            'from' => array( $linePos, $pos ) ) );
00741                                     $failed = true;
00742                                     break;
00743                                 }
00744                             }
00745                             else if ( $state == 'transpose_input' )
00746                             {
00747                                 if ( $char == '=' )
00748                                 {
00749                                     $this->warning( "Already transposing, cannot use mapping marker $char",
00750                                                     array( 'file' => $filename,
00751                                                            'from' => array( $linePos, $pos ) ) );
00752                                     $failed = true;
00753                                     break;
00754                                 }
00755                                 else if ( $char == '-' or
00756                                           $char == '+' )
00757                                 {
00758                                     $this->warning( "Duplicate transpose marker $char",
00759                                                     array( 'file' => $filename,
00760                                                            'from' => array( $linePos, $pos ) ) );
00761                                     $failed = true;
00762                                     break;
00763                                 }
00764                                 else
00765                                 {
00766                                     $this->warning( "Unknown character '$char', expecting transpose value",
00767                                                     array( 'file' => $filename,
00768                                                            'from' => array( $linePos, $pos ) ) );
00769                                     $failed = true;
00770                                     break;
00771                                 }
00772                             }
00773                             else if ( $state == 'replace_input' )
00774                             {
00775                                 if ( $char == '=' )
00776                                 {
00777                                     $this->warning( "Already replacing, cannot use mapping marker $char",
00778                                                     array( 'file' => $filename,
00779                                                            'from' => array( $linePos, $pos ) ) );
00780                                     $failed = true;
00781                                     break;
00782                                 }
00783                                 else if ( $char == '-' or
00784                                           $char == '+' )
00785                                 {
00786                                     $this->warning( "Already replacing, cannot use transpose marker $char",
00787                                                     array( 'file' => $filename,
00788                                                            'from' => array( $linePos, $pos ) ) );
00789                                     $failed = true;
00790                                     break;
00791                                 }
00792                                 else
00793                                 {
00794                                     $this->warning( "Unknown character '$char', expecting replace value",
00795                                                     array( 'file' => $filename,
00796                                                            'from' => array( $linePos, $pos ) ) );
00797                                     $failed = true;
00798                                     break;
00799                                 }
00800                             }
00801                         }
00802                     }
00803                     if ( !$failed )
00804                     {
00805                         if ( $identifier )
00806                         {
00807 //                             print( "\nGot type '$type'\n" );
00808 //                            if ( is_array( $destinationValues ) )
00809 //                                $destinationValues = array_diff( $destinationValues, array( '' ) );
00810 
00811                             if ( !isset( $tbl[$identifier] ) )
00812                                 $tbl[$identifier] = array();
00813 
00814                             if ( $type == 'map' )
00815                             {
00816 //                                 print( "***mapping***:\n" . $sourceValue . ' => ' . implode( ', ', $destinationValues ) . "\n\n" );
00817                                 $this->appendDirectMapping( $tbl[$identifier], $identifier, $sourceValue, $destinationValues );
00818                             }
00819                             else if ( $type == 'replace' )
00820                             {
00821 //                                 print( "***replacing***:\n" . $sourceValue . ' - ' . $sourceEndValue . ' => ' . implode( ', ', $destinationValues ) . "\n\n" );
00822                                 $this->appendReplaceMapping( $tbl[$identifier], $identifier, $sourceValue, $sourceEndValue, $destinationValues );
00823                             }
00824                             else if ( $type == 'transpose' )
00825                             {
00826 //                                 print( "***transposing***:\n" . $sourceValue . ' - ' . $sourceEndValue . ' % ' . $moduloValue . ' + ' . $transposeValue . "\n\n" );
00827                                 $this->appendTransposeMapping( $tbl[$identifier], $identifier, $sourceValue, $sourceEndValue, $transposeValue, $transposeAdd, $moduloValue );
00828                             }
00829                         }
00830 //                         else
00831 //                         {
00832 //                             print( "No identifier found yet, skipping entry!!!!!!!!!!\n" );
00833 //                         }
00834                     }
00835                     else
00836                     {
00837 //                         $this->warning( "Failed adding mapper",
00838 //                                         array( 'file' => $filename,
00839 //                                                'from' => array( $linePos, $pos ) ) );
00840                     }
00841                 }
00842             }
00843 
00844             // Here we read more data from the file, appending to
00845             // the $buffer variable
00846             if ( !feof( $fd ) )
00847             {
00848                 $buffer .= fread( $fd, 4096 );
00849 
00850                 // Make sure we have Unix endline characters
00851                 $buffer = preg_replace( "#(\r\n|\r|\n)#", "\n", $buffer );
00852             }
00853             ++$i;
00854         }
00855 
00856         fclose( $fd );
00857 
00858         $this->TransformationTables = array_merge( $this->TransformationTables, $tbl );
00859     }
00860 
00861     /*!
00862      \private
00863      Appends a mapping from one value to another.
00864      \param $block Current block it is working on
00865      \param $identifier The current identifier it is working on
00866      \param $sourceValue The original value
00867      \param $destinationValues The value it should be mapped to
00868     */
00869     function appendDirectMapping( &$block, $identifier, $sourceValue, $destinationValues )
00870     {
00871         $count = count( $block );
00872         if ( count( $destinationValues ) == 1 )
00873             $destinationValues = array_pop( $destinationValues );
00874         if ( isset( $block[$count - 1] ) and
00875              $block[$count - 1][0] == self::TYPE_DIRECT and
00876              $block[$count - 1][2] == $identifier )
00877         {
00878             $block[$count - 1][1][$sourceValue] = $destinationValues;
00879         }
00880         else
00881         {
00882             $block[] = array( self::TYPE_DIRECT,
00883                               array( $sourceValue => $destinationValues ),
00884                               $identifier );
00885 
00886         }
00887     }
00888 
00889     /*!
00890      \private
00891      Appends a mapping for a range of values into a specific value
00892      \param $block Current block it is working on
00893      \param $identifier The current identifier it is working on
00894      \param $sourceValue The start of the original value
00895      \param $sourceEndValue The ned of the original value
00896      \param $destinationValues The value it should be mapped to
00897     */
00898     function appendReplaceMapping( &$block, $identifier, $sourceValue, $sourceEndValue, $destinationValues )
00899     {
00900         $count = count( $block );
00901         if ( count( $destinationValues ) == 1 )
00902             $destinationValues = array_pop( $destinationValues );
00903         if ( isset( $block[$count - 1] ) and
00904              $block[$count - 1][0] == self::TYPE_REPLACE and
00905              $block[$count - 1][2] == $identifier )
00906         {
00907             $block[$count - 1][1][] = array( $sourceValue, $sourceEndValue, $destinationValues );
00908         }
00909         else
00910         {
00911             $block[] = array( self::TYPE_REPLACE,
00912                               array( array( $sourceValue, $sourceEndValue, $destinationValues ) ),
00913                               $identifier );
00914 
00915         }
00916     }
00917 
00918     /*!
00919      \private
00920      Appends a mapping for characters by transposing them up or down.
00921      \param $block Current block it is working on
00922      \param $identifier The current identifier it is working on
00923      \param $sourceValue The start of the original value
00924      \param $sourceEndValue The ned of the original value
00925      \param $transposeValue How much to transpose the values
00926      \param $addValue If \c true the $transposeValue is added to the range if not it is subtracted.
00927     */
00928     function appendTransposeMapping( &$block, $identifier, $sourceValue, $sourceEndValue, $transposeValue, $addValue, $moduloValue )
00929     {
00930         $count = count( $block );
00931         if ( isset( $block[$count - 1] ) and
00932              $block[$count - 1][0] == self::TYPE_RANGE and
00933              $block[$count - 1][2] == $identifier )
00934         {
00935             $block[$count - 1][1][] = array( $sourceValue, $sourceEndValue, $addValue ? $transposeValue : -$transposeValue, $moduloValue );
00936         }
00937         else
00938         {
00939             $block[] = array( self::TYPE_RANGE,
00940                               array( array( $sourceValue, $sourceEndValue, $addValue ? $transposeValue : -$transposeValue, $moduloValue ) ),
00941                               $identifier );
00942 
00943         }
00944     }
00945 
00946     /*!
00947      \private
00948      \return The first unicod value for the data entry \a $data.
00949     */
00950     function extractUnicodeValue( $data )
00951     {
00952         $type = $data['type'];
00953         if ( $type == 'string' )
00954         {
00955             $list = $this->ISOUnicodeCodec->convertString( $data['value'][0] );
00956             return $list[0];
00957         }
00958         else if ( $type == 'ascii' )
00959         {
00960             return $data['value'];
00961         }
00962         else if ( $type == 'unicode' )
00963         {
00964             return $data['value'];
00965         }
00966         else if ( $type == 'remove' )
00967         {
00968             return false;
00969         }
00970         else if ( $type == 'keep' )
00971         {
00972             return true;
00973         }
00974         return null;
00975     }
00976 
00977     /*!
00978      \private
00979      \return The unicode values for the data entry \a $data.
00980     */
00981     function extractUnicodeValues( $data )
00982     {
00983         $type = $data['type'];
00984         if ( $type == 'string' )
00985         {
00986             return $this->ISOUnicodeCodec->convertString( $data['value'] );
00987         }
00988         else if ( $type == 'ascii' )
00989         {
00990             return array( $data['value'] );
00991         }
00992         else if ( $type == 'unicode' )
00993         {
00994             return array( $data['value'] );
00995         }
00996         else if ( $type == 'remove' )
00997         {
00998             return array( false );
00999         }
01000         else if ( $type == 'keep' )
01001         {
01002             return array( true );
01003         }
01004         return array();
01005     }
01006 
01007     /*!
01008      \private
01009      Goes trough all entries in \a $table and if it finds identifier references
01010      it will fetch the table for that identifier and merge in the current one.
01011      \return The expanded table.
01012     */
01013     function expandInheritance( $table )
01014     {
01015         $newTable = array();
01016         foreach ( $table as $tableItem )
01017         {
01018             if ( is_string( $tableItem ) )
01019             {
01020                 $identifier = $tableItem;
01021                 $subTable = $this->mappingTable( $identifier );
01022                 if ( !$subTable )
01023                 {
01024                     eZDebug::writeError( "Failed to fetch mapping table for identifier: '$identifier'" );
01025                 }
01026                 else
01027                 {
01028                     $subTable = $this->expandInheritance( $subTable );
01029                     $newTable = array_merge( $newTable, $subTable );
01030                 }
01031             }
01032             else
01033             {
01034                 $newTable[] = $tableItem;
01035             }
01036         }
01037         return $newTable;
01038     }
01039 
01040     /*!
01041      Turns the character list $list into an array with ordinal values
01042      \param $list Can be on of these types:
01043                   - String - each character is turned into an ordinal value
01044                   - Numeric - the numeric is used as ordinal value
01045                   - Boolean - means no character
01046                   - Array - each element is turned into an ordinal value by recursion
01047     */
01048     function ordinalValues( $table, $list )
01049     {
01050         $ordinals = array();
01051         if ( is_string( $list ) )
01052         {
01053             $len = strlen( $list );
01054             for ( $offset = 0; $offset < $len; ++$offset )
01055             {
01056                 $ordinals[] = ord( $list[$offset] );
01057             }
01058         }
01059         else if ( is_numeric( $list ) )
01060         {
01061             $ordinals[] = $list;
01062         }
01063         else if ( is_array( $list ) )
01064         {
01065             foreach ( $list as $item )
01066             {
01067                 $ordinals = array_merge( $ordinals, eZCodeMapper::ordinalValues( $table, $item ) );
01068             }
01069         }
01070         $ordinals = eZCodeMapper::mapOrdinals( $table, $ordinals );
01071         return $ordinals;
01072     }
01073 
01074     /*!
01075      Goes trough each ordinal in \a $ordinals and sees if there is mapping for it.
01076      If it is the mapping is applied and used as the new ordinal, if the mapping refers to
01077      an array it will be mapped recursively.
01078     */
01079     function mapOrdinals( $table, $ordinals )
01080     {
01081         $mappedOrdinals = array();
01082         foreach ( $ordinals as $ordinal )
01083         {
01084             while ( !is_array( $ordinal ) and isset( $table[$ordinal] ) )
01085             {
01086                 $ordinal = $table[$ordinal];
01087                 if ( is_array( $ordinal ) )
01088                 {
01089                     $ordinal = eZCodeMapper::mapOrdinals( $table, $ordinal );
01090                 }
01091             }
01092             if ( is_array( $ordinal ) )
01093                 $mappedOrdinals = array_merge( $mappedOrdinals, $ordinal );
01094             else
01095                 $mappedOrdinals[] = $ordinal;
01096         }
01097         return $mappedOrdinals;
01098     }
01099 
01100     /*!
01101      Goes trough all to codes in the mapping table \a $unicodeMap and maps
01102      those that match \a $fromCode into \a $toCode.
01103 
01104      \return \a $unicodeMap
01105     */
01106     protected function mapExistingCodes( $unicodeMap, $fromCode, $toCode )
01107     {
01108         foreach ( $unicodeMap as $from => $to )
01109         {
01110             if ( is_array( $to ) )
01111             {
01112                 $newTo = array();
01113                 foreach ( $to as $ordinal )
01114                 {
01115                     if ( $ordinal == $fromCode )
01116                     {
01117                         $newTo = array_merge( $newTo, array( $toCode ) );
01118                     }
01119                     else
01120                     {
01121                         $newTo[] = $ordinal;
01122                     }
01123                 }
01124                 $unicodeMap[$from] = $newTo;
01125             }
01126             else if ( $to == $fromCode )
01127             {
01128                 $unicodeMap[$from] = $toCode;
01129             }
01130         }
01131         return $unicodeMap;
01132     }
01133 
01134     /*!
01135      Goes through the mapping rules in the table \a $table and generates a simple
01136      mapping table which maps from one Unicode value to another (or array of values).
01137 
01138      The generation uses backward and forward propagation of the defined mappings
01139      to get the proper end result of a given value.
01140 
01141      \note This method can take a while if lots of rules are used
01142     */
01143     function generateSimpleMappingTable( $table, $allowedRanges )
01144     {
01145         if ( !is_array( $table ) )
01146             return false;
01147         $unicodeMap = array();
01148         foreach ( $table as $tableItem )
01149         {
01150             $type = $tableItem[0];
01151             $item = $tableItem[1];
01152             if ( isset( $tableItem[2] ) )
01153             {
01154                 $identifier = $tableItem[2];
01155 //                print( "identifier: $identifier\n" );
01156             }
01157             if ( $type == self::TYPE_DIRECT )
01158             {
01159                 foreach ( $item as $fromCode => $toCode )
01160                 {
01161 //                    print( "from: $fromCode, to: $toCode\n" );
01162 //                     if ( $fromCode == 1026 )
01163 //                     {
01164 //                         print( "<pre>oldcode<br/>" ); var_dump( $toCode ); print( "</pre>" );
01165 //                     }
01166                     $toCode = eZCodeMapper::ordinalValues( $unicodeMap, $toCode );
01167 //                     if ( $fromCode == 1026 )
01168 //                     {
01169 //                         print( "<pre>newcode<br/>" ); var_dump( $toCode ); print( "</pre>" );
01170 //                     }
01171                     if ( count( $allowedRanges ) == 0 )
01172                     {
01173                         if ( count( $toCode ) == 1 )
01174                             $toCode = $toCode[0];
01175                         // If the mapping already exists we skip it
01176                         if ( isset( $unicodeMap[$fromCode] ) )
01177                             continue;
01178 
01179                         $unicodeMap[$fromCode] = $toCode;
01180                         $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $fromCode, $toCode );
01181                     }
01182                     else
01183                     {
01184                         $allowed = false;
01185                         foreach ( $allowedRanges as $allowedRange )
01186                         {
01187                             if ( $fromCode >= $allowedRange[0] and
01188                                  $fromCode <= $allowedRange[1] )
01189                             {
01190                                 $allowed = true;
01191                                 break;
01192                             }
01193                         }
01194                         if ( !$allowed )
01195                             continue;
01196 
01197                         $toCodeList = $toCode;
01198                         $newToCodeList = array();
01199                         foreach ( $toCodeList as $toCode )
01200                         {
01201                             if ( is_bool( $toCode ) )
01202                             {
01203                                 $newToCodeList[] = $toCode;
01204                                 continue;
01205                             }
01206                             foreach ( $allowedRanges as $allowedRange )
01207                             {
01208                                 if ( $toCode >= $allowedRange[0] and
01209                                      $toCode <= $allowedRange[1] )
01210                                 {
01211                                     break;
01212                                 }
01213                             }
01214                             if ( $allowed )
01215                             {
01216                                 $newToCodeList[] = $toCode;
01217                             }
01218                         }
01219                         $toCode = $newToCodeList;
01220                         if ( count( $toCode ) > 0 )
01221                         {
01222                             if ( count( $toCode ) == 1 )
01223                                 $toCode = $toCode[0];
01224 
01225                             // If the mapping already exists we skip it
01226                             if ( isset( $unicodeMap[$fromCode] ) )
01227                                 continue;
01228 
01229                             $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $fromCode, $toCode );
01230 
01231                             $unicodeMap[$fromCode] = $toCode;
01232                         }
01233                     }
01234                 }
01235             }
01236             else if ( $type == self::TYPE_RANGE )
01237             {
01238                 foreach ( $item as $rangeItem )
01239                 {
01240                     $start = $rangeItem[0];
01241                     $stop = $rangeItem[1];
01242                     if ( $start > $stop )
01243                     {
01244                         $tmp = $stop;
01245                         $stop = $start;
01246                         $start = $tmp;
01247                     }
01248                     $add = $rangeItem[2];
01249                     $modulo = $rangeItem[3];
01250                     // Sanity-check, to avoid infinite loops
01251                     if ( $modulo == 0 )
01252                         $modulo = 1;
01253                     for ( $i = $start; $i <= $stop; $i += $modulo )
01254                     {
01255                         if ( count( $allowedRanges ) == 0 )
01256                         {
01257                             $allowed = true;
01258                         }
01259                         else
01260                         {
01261                             $allowed = false;
01262                             foreach ( $allowedRanges as $allowedRange )
01263                             {
01264                                 if ( $i >= $allowedRange[0] and
01265                                      $i <= $allowedRange[1] )
01266                                 {
01267                                     $allowed = true;
01268                                     break;
01269                                 }
01270                             }
01271                             if ( !$allowed )
01272                                 continue;
01273                         }
01274 
01275                         $replace = $i + $add;
01276                         $replace = eZCodeMapper::ordinalValues( $unicodeMap, $replace );
01277                         if ( count( $allowedRanges ) == 0 )
01278                         {
01279                             if ( count( $replace ) == 0 )
01280                                 $replace = false;
01281                             else if ( count( $replace ) == 1 )
01282                                 $replace = $replace[0];
01283                             $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $i, $replace );
01284 
01285                             // If the mapping already exists we skip it
01286                             if ( isset( $unicodeMap[$i] ) )
01287                                 continue;
01288 
01289                             $unicodeMap[$i] = $replace;
01290                         }
01291                         else
01292                         {
01293                             $newReplace = array();
01294                             foreach ( $allowedRanges as $allowedRange )
01295                             {
01296                                 foreach ( $replace as $replaceOrdinal )
01297                                 {
01298                                     if ( $replaceOrdinal >= $allowedRange[0] and
01299                                          $replaceOrdinal <= $allowedRange[1] )
01300                                     {
01301                                         $newReplace[] = $replaceOrdinal;
01302                                     }
01303                                 }
01304                             }
01305                             if ( count( $newReplace ) == 0 )
01306                                 $replace = false;
01307                             else if ( count( $newReplace ) == 1 )
01308                                 $replace = $newReplace[0];
01309                             else
01310                                 $replace = $newReplace;
01311 
01312                             // If the mapping already exists we skip it
01313                             if ( isset( $unicodeMap[$i] ) )
01314                                 continue;
01315 
01316                             $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $i, $replace );
01317                             $unicodeMap[$i] = $replace;
01318                         }
01319                     }
01320                 }
01321             }
01322             else if ( $type == self::TYPE_REPLACE )
01323             {
01324                 foreach ( $item as $rangeItem )
01325                 {
01326                     $start = $rangeItem[0];
01327                     $stop = $rangeItem[1];
01328                     if ( $start > $stop )
01329                     {
01330                         $tmp = $stop;
01331                         $stop = $start;
01332                         $start = $tmp;
01333                     }
01334                     $replace = $rangeItem[2];
01335                     $replace = eZCodeMapper::ordinalValues( $unicodeMap, $replace );
01336                     if ( count( $allowedRanges ) == 0 )
01337                     {
01338                         if ( count( $replace ) == 0 )
01339                             $replace = false;
01340                         else if ( count( $replace ) == 1 )
01341                             $replace = $replace[0];
01342                         for ( $i = $start; $i <= $stop; ++$i )
01343                         {
01344                             // If the mapping already exists we skip it
01345                             if ( isset( $unicodeMap[$i] ) )
01346                                 continue;
01347 
01348                             $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $i, $replace );
01349                             $unicodeMap[$i] = $replace;
01350                         }
01351                     }
01352                     else
01353                     {
01354                         $newReplace = array();
01355                         foreach ( $allowedRanges as $allowedRange )
01356                         {
01357                             foreach ( $replace as $replaceOrdinal )
01358                             {
01359                                 if ( $replaceOrdinal >= $allowedRange[0] and
01360                                      $replaceOrdinal <= $allowedRange[1] )
01361                                 {
01362                                     $newReplace[] = $replaceOrdinal;
01363                                 }
01364                             }
01365                         }
01366                         if ( count( $newReplace ) == 0 )
01367                             $replace = false;
01368                         else if ( count( $newReplace ) == 1 )
01369                             $replace = $newReplace[0];
01370                         else
01371                             $replace = $newReplace;
01372                         for ( $i = $start; $i <= $stop; ++$i )
01373                         {
01374                             $allowed = false;
01375                             foreach ( $allowedRanges as $allowedRange )
01376                             {
01377                                 if ( $i >= $allowedRange[0] and
01378                                      $i <= $allowedRange[1] )
01379                                 {
01380                                     $allowed = true;
01381                                     break;
01382                                 }
01383                             }
01384                             if ( $allowed )
01385                             {
01386                                 // If the mapping already exists we skip it
01387                                 if ( isset( $unicodeMap[$i] ) )
01388                                     continue;
01389 
01390                                 $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $i, $replace );
01391                                 $unicodeMap[$i] = $replace;
01392                             }
01393                         }
01394                     }
01395                 }
01396             }
01397         }
01398         return $unicodeMap;
01399     }
01400 
/*!
 Generates a unicode mapping table for the identifier(s) \a $identifier.

 \param $identifier Either a single identifier string or an array of identifiers.
 \return The unicode mapping table for all the given identifiers, sorted by key.
*/
function generateMappingCode( $identifier )
{
    $identifierList = is_array( $identifier ) ? $identifier : array( $identifier );
    $expandedTable = $this->expandInheritance( $identifierList );

    // An empty range list places no restriction on the characters, all are allowed for now
    $unrestrictedRanges = array();
    $unicodeMap = $this->generateSimpleMappingTable( $expandedTable, $unrestrictedRanges );
    ksort( $unicodeMap );
    return $unicodeMap;
}
01420 
/*!
 Generates a mapping table for the character set \a $charset.
 The resulting table only works for text in that character set, but it is much
 faster to apply and can be fed directly to the strtr() PHP function.

 \param $unicodeTable Unicode mapping table, each key is the value to match and each
                      entry is the replacement, either a single value or an array of values.
 \param $charset Name of the character set the table is generated for.
 \return The table or \c false if something failed.
*/
function generateCharsetMappingTable( $unicodeTable, $charset )
{
    $codec = eZTextCodec::instance( 'unicode', $charset );
    if ( !$codec )
    {
        eZDebug::writeError( "Failed to create textcodec for charset '$charset'" );
        return false;
    }

    $charsetTable = array();
    foreach ( $unicodeTable as $match => $replacement )
    {
        // The codec is always handed an array, so single values are wrapped first
        $matchLocal = $codec->convertString( array( $match ) );
        $replacementValues = is_array( $replacement ) ? $replacement : array( $replacement );
        $charsetTable[$matchLocal] = $codec->convertString( $replacementValues );
    }

    // Reverse sort on the keys. The intention stated by the original author is to get
    // longer string entries placed before the shorter ones, which matters for
    // utf8 where characters have variable length.
    krsort( $charsetTable );
    return $charsetTable;
}
01459 
/*!
 Decodes a command into transformation rules.
 Only rule names reported by ruleNames() are ever returned.

 \param $name Name of the command
 \param $parameters Array of parameters for the command
 \return An array with the names of the matching transformation rules,
         the array is empty when the command has no table based rules.
*/
function decodeCommand( $name, $parameters )
{
    $knownRules = $this->ruleNames();
    $selected = array();

    // Special code handlers, they are implemented with custom code and have no rules
    $codeHandlers = array( 'url_cleanup_iri', 'url_cleanup', 'url_cleanup_compat', 'identifier_cleanup' );
    if ( in_array( $name, $codeHandlers ) )
    {
        return $selected;
    }

    // Commands whose rules are named <parameter>_<command>
    $suffixCommands = array( 'normalize', 'search_normalize', 'decompose', 'diacritical',
                             'lowercase', 'uppercase', 'search_cleanup' );
    if ( in_array( $name, $suffixCommands ) )
    {
        if ( count( $parameters ) == 0 )
        {
            // Without parameters every rule ending in the command name is included
            foreach ( $knownRules as $candidate )
            {
                if ( preg_match( '#_'. $name . '$#', $candidate ) )
                    $selected[] = $candidate;
            }
        }
        else
        {
            foreach ( $parameters as $prefix )
            {
                $candidate = $prefix . '_' . $name;
                if ( in_array( $candidate, $knownRules ) )
                    $selected[] = $candidate;
            }
        }
        return $selected;
    }

    // Commands whose rules are named <from><divider><to>
    if ( in_array( $name, array( 'transform', 'transliterate' ) ) )
    {
        $dividers = array( 'transform' => '_to_',
                           'transliterate' => '_transliterate_' );
        $divider = $dividers[$name];
        $parameterCount = count( $parameters );
        if ( $parameterCount == 0 )
        {
            // Without parameters all transformation rules using the divider are included
            foreach ( $knownRules as $candidate )
            {
                if ( preg_match( '#^[a-zA-Z][a-zA-Z0-9-]+'. $divider . '[a-zA-Z][a-zA-Z0-9-]+$#', $candidate ) )
                    $selected[] = $candidate;
            }
        }
        else if ( $parameterCount == 2 )
        {
            $candidate = $parameters[0] . $divider . $parameters[1];
            if ( in_array( $candidate, $knownRules ) )
                $selected[] = $candidate;
        }
        return $selected;
    }

    // Anything else must be registered as an extension command in transform.ini
    $ini = eZINI::instance( 'transform.ini' );
    $extensionCommands = $ini->variable( 'Extensions', 'Commands' );
    if ( !isset( $extensionCommands[$name] ) )
    {
        eZDebug::writeError( "Unknown command '$name'",
                             'eZCharTransform::decodeCommand' );
    }
    return $selected;
}
01545 
/*!
 Generates PHP code for the command \a $command.
 The generated code reads and modifies a variable named \c $text and mirrors
 the operations executeCommandCode() performs directly.

 \param $command Array describing the command, its name is read from the \c 'command' entry.
 \param $charsetName The name of the charset the text will be in,
                     this can be used to generate different code for different charsets.
 \return A string containing PHP code or \c false if not supported.
*/
function generateCommandCode( $command, $charsetName )
{
    $commandName = $command['command'];

    if ( $commandName == 'url_cleanup_iri' )
    {
        $charsetNameTxt = var_export( $charsetName, true );
        return "\$text = eZCharTransform::commandUrlCleanupIRI( \$text, $charsetNameTxt );\n";
    }

    if ( $commandName == 'url_cleanup' )
    {
        $charsetNameTxt = var_export( $charsetName, true );
        return "\$text = eZCharTransform::commandUrlCleanup( \$text, $charsetNameTxt );\n";
    }

    if ( $commandName == 'url_cleanup_compat' )
    {
        $charsetNameTxt = var_export( $charsetName, true );
        return "\$text = eZCharTransform::commandUrlCleanupCompat( \$text, $charsetNameTxt );\n";
    }

    if ( $commandName == 'identifier_cleanup' )
    {
        $code = ( "\$text = strtolower( \$text );\n" .
                  "\$text = preg_replace( array( \"#[^a-z0-9_ ]#\",\n" .
                  "                             \"/ /\",\n" .
                  "                             \"/__+/\",\n" .
                  "                             \"/^_|_$/\" ),\n" .
                  "                      array( \" \",\n" .
                  "                             \"_\",\n" .
                  "                             \"_\",\n" .
                  "                             \"\" ),\n" .
                  "                      \$text );\n" );
        return $code;
    }

    if ( $commandName == 'search_cleanup' )
    {
        $code = '';
        $nonCJKCharsets = $this->nonCJKCharsets();
        if ( !in_array( $charsetName, $nonCJKCharsets ) )
        {
            // The charset may contain CJK text, so code which splits such text
            // into overlapping two-character sequences is generated as well
            $code .= ( '// add N-Gram(N=2)  chinese / japanese / korean multibyte characters' . "\n" .
                       '//include_once( \'lib/ezi18n/classes/eztextcodec.php\' );' . "\n" .
                       '$codec = eZTextCodec::instance( false, \'unicode\' );' . "\n" .
                       "\n" .
                       '$unicodeValueArray = $codec->convertString( $text );' . "\n" .
                       "\n" .
                       '$normalizedTextArray = array();' . "\n" .
                       '$bFlag = false;' . "\n" .
                       'foreach ( array_keys( $unicodeValueArray ) as $valueKey )' . "\n" .
                       '{' . "\n" .
                       '    // Check for word characters that should be broken up for search' . "\n" .
                       '    if ( ( $unicodeValueArray[$valueKey] >= 12289 and' . "\n" .
                       '           $unicodeValueArray[$valueKey] <= 12542 ) or' . "\n" .
                       '         ( $unicodeValueArray[$valueKey] >= 13312 and' . "\n" .
                       '           $unicodeValueArray[$valueKey] <= 40863 ) or' . "\n" .
                       '         ( $unicodeValueArray[$valueKey] >= 44032 and' . "\n" .
                       '           $unicodeValueArray[$valueKey] <= 55203 ) )' . "\n" .
                       '    {' . "\n" .
                       '        if ( $bFlag )' . "\n" .
                       '        {' . "\n" .
                       '            $normalizedTextArray[] = $unicodeValueArray[$valueKey];' . "\n" .
                       '        }' . "\n" .
                       '        $normalizedTextArray[] = 32; // A space' . "\n" .
                       '        $normalizedTextArray[] = $unicodeValueArray[$valueKey];' . "\n" .
                       '        $bFlag = true;' . "\n" .
                       '    }' . "\n" .
                       '    else' . "\n" .
                       '    {' . "\n" .
                       '        if ( $bFlag )' . "\n" .
                       '        {' . "\n" .
                       '            $normalizedTextArray[] = 32; // A space' . "\n" .
                       '        }' . "\n" .
                       '        $normalizedTextArray[] = $unicodeValueArray[$valueKey];' . "\n" .
                       '        $bFlag = false;' . "\n" .
                       '    }' . "\n" .
                       '}' . "\n" .
                       'if ( $bFlag )' . "\n" .
                       '{' . "\n" .
                       '    $normalizedTextArray[count($normalizedTextArray)-1]=32;' . "\n" .
                       '}' . "\n" .
                       '$revCodec = eZTextCodec::instance( \'unicode\', false ); // false means use internal charset' . "\n" .
                       '$text = $revCodec->convertString( $normalizedTextArray );' . "\n" );
        }

        // The last replacement must be "$1 " (not " "), otherwise the character in
        // front of the % is thrown away together with the %. This is the same
        // replacement executeCommandCode() uses for the pattern.
        $code .= ( '$text = preg_replace( array( "#(\.){2,}#",' . "\n" .
                   '                             "#^\.#",' . "\n" .
                   '                             "#\s\.#",' . "\n" .
                   '                             "#\.\s#",' . "\n" .
                   '                             "#\.$#",' . "\n" .
                   '                             "#([^0-9])%#" ),' . "\n" .
                   '                      array( " ",' . "\n" .
                   '                             " ",' . "\n" .
                   '                             " ",' . "\n" .
                   '                             " ",' . "\n" .
                   '                             " ",' . "\n" .
                   '                             "$1 " ),' . "\n" .
                   '                      $text );' . "\n" .
                   '$ini = eZINI::instance();' . "\n" .
                   'if ( $ini->variable( \'SearchSettings\', \'EnableWildcard\' ) != \'true\' )' . "\n" .
                   '{' . "\n" .
                   '    $text = str_replace( "*", " ", $text );' . "\n" .
                   '}' . "\n" .
                   '$charset = eZTextCodec::internalCharset();' . "\n" .
                   '$hasUTF8 = ( $charset == "utf-8" );' . "\n" .
                   "\n" .
                   'if ( $hasUTF8 )' . "\n" .
                   '{' . "\n" .
                   '    $text = preg_replace( "#(\s+)#u", " ", $text );' . "\n" .
                   '}' . "\n" .
                   'else' . "\n" .
                   '{' . "\n" .
                   '    $text = preg_replace( "#(\s+)#", " ", $text );' . "\n" .
                   '}' );

        return $code;
    }

    // Not a built-in command, look for an extension command defined as "path:className"
    $ini = eZINI::instance( 'transform.ini' );
    $commands = $ini->variable( 'Extensions', 'Commands' );
    if ( isset( $commands[$commandName] ) )
    {
        // explode() replaces split() which no longer exists in PHP 7 and later,
        // the padding avoids a notice when the definition has no ':' separator
        list( $path, $className ) = array_pad( explode( ':', $commands[$commandName], 2 ), 2, '' );
        if ( !file_exists( $path ) )
        {
            eZDebug::writeError( "Could not locate include file '$path' for transformation '" . $command['command'] . "'" );
        }
        else if ( $className == '' )
        {
            // Without a class name the generated static call would be invalid PHP
            eZDebug::writeError( "Missing class name in command definition '" . $commands[$commandName] . "' for transformation '" . $command['command'] . "'" );
        }
        else
        {
            $charsetNameTxt = var_export( $charsetName, true );
            $commandTxt     = var_export( $command['command'], true );
            $pathTxt        = var_export( $path, true );
            $code = "include_once( $pathTxt );\n\$text = $className::executeCommand( \$text, $commandTxt, $charsetNameTxt );\n";
            return $code;
        }
    }
    return false;
}
01690 
/*!
 Executes custom PHP code for the command \a $command.

 \param $text The text to work on, it is passed by reference and modified in place.
 \param $command Array describing the command, its name is read from the \c 'command' entry.
 \param $charsetName The name of the charset the text will be in,
                     this can be used to execute different code for different charsets.
 \return \c true if the command is supported, \c false otherwise.
*/
function executeCommandCode( &$text, $command, $charsetName )
{
    $commandName = $command['command'];

    if ( $commandName == 'url_cleanup_iri' )
    {
        $text = eZCharTransform::commandUrlCleanupIRI( $text, $charsetName );
        return true;
    }

    if ( $commandName == 'url_cleanup' )
    {
        $text = eZCharTransform::commandUrlCleanup( $text, $charsetName );
        return true;
    }

    if ( $commandName == 'url_cleanup_compat' )
    {
        $text = eZCharTransform::commandUrlCleanupCompat( $text, $charsetName );
        return true;
    }

    if ( $commandName == 'identifier_cleanup' )
    {
        // Lowercase, turn everything but a-z, 0-9, _ and space into a space, then
        // turn spaces into underscores, collapse repeated underscores and remove
        // an underscore at the start and at the end
        $text = strtolower( $text );
        $text = preg_replace( array( "#[^a-z0-9_ ]#",
                                     "/ /",
                                     "/__+/",
                                     "/^_|_$/" ),
                              array( " ",
                                     "_",
                                     "_",
                                     "" ),
                              $text );
        return true;
    }

    if ( $commandName == 'search_cleanup' )
    {
        $nonCJKCharsets = $this->nonCJKCharsets();
        if ( !in_array( $charsetName, $nonCJKCharsets ) )
        {
            // The charset may contain chinese / japanese / korean multibyte characters,
            // runs of such characters are split into overlapping two-character
            // sequences (N-Gram with N=2) separated by spaces
            $codec = eZTextCodec::instance( false, 'unicode' );

            $unicodeValueArray = $codec->convertString( $text );

            $normalizedTextArray = array();
            $bFlag = false; // true when the previous character was in one of the ranges below
            foreach ( $unicodeValueArray as $unicodeValue )
            {
                // Check for word characters that should be broken up for search,
                // the ranges are U+3001-U+30FE, U+3400-U+9F9F and U+AC00-U+D7A3
                $breakUp = ( ( $unicodeValue >= 12289 and $unicodeValue <= 12542 ) or
                             ( $unicodeValue >= 13312 and $unicodeValue <= 40863 ) or
                             ( $unicodeValue >= 44032 and $unicodeValue <= 55203 ) );
                if ( $breakUp )
                {
                    if ( $bFlag )
                    {
                        // Completes the pair started by the previous character
                        $normalizedTextArray[] = $unicodeValue;
                    }
                    $normalizedTextArray[] = 32; // A space
                    $normalizedTextArray[] = $unicodeValue;
                    $bFlag = true;
                }
                else
                {
                    if ( $bFlag )
                    {
                        $normalizedTextArray[] = 32; // A space
                    }
                    $normalizedTextArray[] = $unicodeValue;
                    $bFlag = false;
                }
            }

            if ( $bFlag )
            {
                // The last character started a pair which was never completed,
                // it is replaced with a space
                $normalizedTextArray[ count( $normalizedTextArray ) - 1 ] = 32;
            }

            $revCodec = eZTextCodec::instance( 'unicode', false ); // false means use internal charset
            $text = $revCodec->convertString( $normalizedTextArray );
        }

        // Make sure dots inside words/numbers are kept, the rest is turned into space
        $text = preg_replace( array( "#(\.){2,}#",
                                     "#^\.#",
                                     "#\s\.#",
                                     "#\.\s#",
                                     "#\.$#",
                                     "#([^0-9])%#" ), // Keep only % after a number
                              array( " ",
                                     " ",
                                     " ",
                                     " ",
                                     " ",
                                     "$1 " ),
                              $text );
        $ini = eZINI::instance();
        if ( $ini->variable( 'SearchSettings', 'EnableWildcard' ) != 'true' )
        {
            $text = str_replace( "*", " ", $text );
        }

        // Collapse all whitespace runs into one space, the u modifier is only
        // used when the internal charset is utf-8
        $charset = eZTextCodec::internalCharset();
        $hasUTF8 = ( $charset == "utf-8" );
        if ( $hasUTF8 )
        {
            $text = preg_replace( "#(\s+)#u", " ", $text );
        }
        else
        {
            $text = preg_replace( "#(\s+)#", " ", $text );
        }

        return true;
    }

    // Not a built-in command, look for an extension command defined as "path:className"
    $ini = eZINI::instance( 'transform.ini' );
    $commands = $ini->variable( 'Extensions', 'Commands' );
    if ( isset( $commands[$commandName] ) )
    {
        // explode() replaces split() which no longer exists in PHP 7 and later,
        // the padding avoids a notice when the definition has no ':' separator
        list( $path, $className ) = array_pad( explode( ':', $commands[$commandName], 2 ), 2, '' );
        if ( !file_exists( $path ) )
        {
            eZDebug::writeError( "Could not locate include file '$path' for transformation '" . $command['command'] . "'" );
        }
        else if ( $className == '' )
        {
            // Without a class name there is no callback to execute
            eZDebug::writeError( "Missing class name in command definition '" . $commands[$commandName] . "' for transformation '" . $command['command'] . "'" );
        }
        else
        {
            include_once( $path );
            $text = call_user_func_array( array( $className, 'executeCommand' ),
                                          array( $text, $command['command'], $charsetName ) );
            return true;
        }
    }
    return false;
}
01834 
/*!
 \return An array with the names of charsets that are certain to not contain CJK characters.
*/
function nonCJKCharsets()
{
    $charsets = array( 'adobe-standard-encoding' );

    // DOS code pages
    $codePages = array( 437, 737, 775, 850, 852, 855, 857,
                        860, 861, 862, 863, 864, 865, 866,
                        869, 874 );
    foreach ( $codePages as $codePage )
    {
        $charsets[] = 'cp' . $codePage;
    }

    $charsets[] = 'dec-mcs';
    $charsets[] = 'hp-roman8';

    // ISO-8859 parts, note that part 12 is not in the list
    $isoParts = array( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15 );
    foreach ( $isoParts as $isoPart )
    {
        $charsets[] = 'iso-8859-' . $isoPart;
    }

    $otherCharsets = array( 'koi8-r', 'koi8-u', 'macintosh', 'next', 'us-ascii' );
    foreach ( $otherCharsets as $otherCharset )
    {
        $charsets[] = $otherCharset;
    }

    // Windows code pages 1250 up to and including 1258
    for ( $windowsPage = 1250; $windowsPage <= 1258; ++$windowsPage )
    {
        $charsets[] = 'windows-' . $windowsPage;
    }

    return $charsets;
}
01853 
01854     /// \privatesection
01855     public $TransformationTables;
01856     public $TransformationFiles;
01857     public $ISOUnicodeCodec;
01858 }
01859 
01860 ?>