|
eZ Publish
[4.0]
|
<?php
//
// Definition of eZCodeMapper class
//
// Created on: <18-Jun-2004 14:56:15 amos>
//
// ## BEGIN COPYRIGHT, LICENSE AND WARRANTY NOTICE ##
// SOFTWARE NAME: eZ Publish
// SOFTWARE RELEASE: 4.0.x
// COPYRIGHT NOTICE: Copyright (C) 1999-2008 eZ Systems AS
// SOFTWARE LICENSE: GNU General Public License v2.0
// NOTICE: >
//   This program is free software; you can redistribute it and/or
//   modify it under the terms of version 2.0 of the GNU General
//   Public License as published by the Free Software Foundation.
//
//   This program is distributed in the hope that it will be useful,
//   but WITHOUT ANY WARRANTY; without even the implied warranty of
//   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//   GNU General Public License for more details.
//
//   You should have received a copy of version 2.0 of the GNU General
//   Public License along with this program; if not, write to the Free
//   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
//   MA 02110-1301, USA.
//
//
// ## END COPYRIGHT, LICENSE AND WARRANTY NOTICE ##
//

/*! \file ezcodemapper.php
*/

/*!
  \class eZCodeMapper ezcodemapper.php
  \ingroup eZI18N
  \brief Handles mapping of character codes

  Rules are read from transformation files and stored per identifier in
  \c TransformationTables; the names of the files that have been parsed are
  remembered in \c TransformationFiles.
*/

class eZCodeMapper
{
    /// Rule block type: a list of one-to-one (or one-to-many) code mappings
    const TYPE_DIRECT = 1;
    /// Rule block type: a range of codes which is transposed up or down
    const TYPE_RANGE = 2;
    /// Rule block type: a range of codes which is replaced by fixed value(s)
    const TYPE_REPLACE = 3;

    /*!
     Constructor, starts out with no rules and no loaded files.

     \note This is declared before the PHP 4 style constructor below so that
           it is the one PHP picks up; class-named methods are no longer
           treated as constructors by PHP 8.
    */
    function __construct()
    {
        $this->TransformationTables = array();
        $this->TransformationFiles = array();
    }

    /*!
     PHP 4 style constructor, kept so that existing code which calls it
     explicitly keeps working. It simply forwards to __construct().
    */
    function eZCodeMapper()
    {
        $this->__construct();
    }

    /*!
     \return The mapping table for identifier \a $identifier or \c false if it is not found.
*/
    function mappingTable( $identifier )
    {
        if ( isset( $this->TransformationTables[$identifier] ) )
            return $this->TransformationTables[$identifier];
        return false;
    }

    /*!
     \return An array with the names of rules which are currently available.
    */
    function ruleNames()
    {
        return array_keys( $this->TransformationTables );
    }

    /*!
     Outputs error \a $text found in parsed file at position \a $position.

     \param $text The error message.
     \param $position \c false for no position, otherwise an array with the
                      keys \c file and \c from and optionally \c to. Both
                      \c from and \c to are arrays of the form ( line, column ).

     The message goes to eZCLI when that class is available, otherwise to eZDebug.
    */
    function error( $text, $position = false )
    {
        // Must exist before the append below, the position prefix is optional
        $str = '';
        if ( $position )
        {
            $str = $position['file'] . ':' . $position['from'][0] . ' C' . $position['from'][1];
            if ( isset( $position['to'] ) )
                $str .= ' -> L' . $position['to'][0] . ' C' . $position['to'][1];
            $str .= ':';
        }
        $str .= $text;
        if ( class_exists( 'ezcli' ) )
        {
            $cli = eZCLI::instance();
            $cli->error( $str );
        }
        else
        {
            eZDebug::writeError( $str, 'eZCodeMapper::error' );
        }
    }

    /*!
     Outputs warning \a $text found in parsed file at position \a $position.

     \param $text The warning message.
     \param $position \c false for no position, otherwise an array with the
                      keys \c file and \c from and optionally \c to. Both
                      \c from and \c to are arrays of the form ( line, column ).

     The message goes to eZCLI when that class is available, otherwise to eZDebug.
    */
    function warning( $text, $position = false )
    {
        // Must exist before the append below, the position prefix is optional
        $str = '';
        if ( $position )
        {
            $str = $position['file'] . ':' . $position['from'][0] . ' C' . $position['from'][1];
            if ( isset( $position['to'] ) )
                $str .= ' -> L' . $position['to'][0] . ' C' . $position['to'][1];
            $str .= ':';
        }
        $str .= $text;
        if ( class_exists( 'ezcli' ) )
        {
            $cli = eZCLI::instance();
            $cli->warning( $str );
        }
        else
        {
            eZDebug::writeWarning( $str, 'eZCodeMapper::warning' );
        }
    }

    /*!
     \return \c true if the transformation file is already loaded.
*/
    function isTranformationLoaded( $name )
    {
        return in_array( $name, $this->TransformationFiles );
    }

    /*!
     Loads all transformation files defined in \c transform.ini to the current
     mapper. It will also load any transformations found in extensions.

     \param $currentCharset The name of the current charset in use. The caller must
                            make sure this is not an alias by using eZCharsetInfo::realCharsetCode()
     \param $transformationGroup The transformation group which is currently used or \c false for none.

     A file is looked up in every repository and parsed from each one where it
     exists. Files which are already registered as loaded are skipped.
    */
    function loadTransformationFiles( $currentCharset, $transformationGroup )
    {
        $ini = eZINI::instance( 'transform.ini' );
        $repositoryList = array( $ini->variable( 'Transformation', 'Repository' ) );
        $files = $ini->variable( 'Transformation', 'Files' );
        $extensions = $ini->variable( 'Transformation', 'Extensions' );
        $repositoryList = array_merge( $repositoryList,
                                       eZExtension::expandedPathList( $extensions, 'transformations' ) );

        // Check if the current charset maps to a unicode group
        // If it does it can trigger loading of additional files
        $unicodeGroups = array();
        $charsets = $ini->variable( 'Transformation', 'Charsets' );
        foreach ( $charsets as $entry )
        {
            // Entries have the form "charset;group", anything else cannot
            // name a group and is ignored
            $parts = explode( ';', $entry, 2 );
            if ( count( $parts ) < 2 )
                continue;
            list( $charset, $group ) = $parts;
            $charset = eZCharsetInfo::realCharsetCode( $charset );
            if ( $charset == $currentCharset and
                 !in_array( $group, $unicodeGroups ) )
            {
                $unicodeGroups[] = $group;
            }
        }

        // If we are using transformation groups then add that as
        // a unicode group. This causes it load transformation files
        // specific to that group.
        // A group which was already picked up from the charset list is not
        // added again, that would only make the same files parse twice.
        if ( $transformationGroup !== false and
             !in_array( $transformationGroup, $unicodeGroups ) )
        {
            $unicodeGroups[] = $transformationGroup;
        }

        // Add any extra files from the unicode groups
        foreach ( $unicodeGroups as $unicodeGroup )
        {
            if ( $ini->hasGroup( $unicodeGroup ) )
            {
                $files = array_merge( $files, $ini->variable( $unicodeGroup, 'Files' ) );
                $extensions = $ini->variable( $unicodeGroup, 'Extensions' );
                $repositoryList = array_merge( $repositoryList,
                                               eZExtension::expandedPathList( $extensions, 'transformations' ) );
            }
        }

        // The same path can be contributed by several groups, keep the first
        // occurrence only so no file is parsed more than once per repository
        $repositoryList = array_unique( $repositoryList );

        foreach ( $files as $file )
        {
            // Only load files that are not currently loaded
            if ( $this->isTranformationLoaded( $file ) )
                continue;

            foreach ( $repositoryList as $repository )
            {
                $trFile = $repository . '/' . $file;
                if ( file_exists( $trFile ) )
                {
                    $this->parseTransformationFile( $trFile, $file );
                }
            }
        }
    }

    /*!
     Parses the transformation file \a $filename and appends any rules it finds
     to the current rule list.
     \param $name The name of transformation file as it was requested, ie.
without a path
     \return \c false if the file could not be opened, otherwise nothing.

     The file is read in chunks and handled one line at a time:
     - Everything from the first \c # on a line is dropped as a comment
       (this also applies to a \c # inside a quoted string).
     - A line of the form <tt>name:</tt> starts a new rule identifier.
     - Any other line is a rule made up of values and markers. A value is a
       quoted string, <tt>U+XXXX</tt>, a two digit hex number or one of the
       words \c remove and \c keep. The markers are \c =, \c -, \c + and \c %.

     Lines which cannot be parsed are reported with warning() and skipped.
    */
    function parseTransformationFile( $filename, $name )
    {
        $tbl = array();

        $fd = fopen( $filename, "rb" );
        if ( !$fd )
        {
            $this->error( "Failed opening $filename" );
            return false;
        }

        $this->TransformationFiles[] = $name;

        $this->ISOUnicodeCodec = eZTextCodec::instance( 'iso-8859-1', 'unicode' );

        $buffer = '';
        $lineNum = 1;
        $hexValues = "0123456789abcdefABCDEF";
        $identifier = false;

        // The big funky parser starts here
        // It starts by reading a chunk of data from the file
        // then splits everything into an array with lines.
        // Then it traverses one line at a time looking for
        // identifiers and rules. Comments will be removed before the
        // line is parsed for identifiers and rules.

        while ( !feof( $fd ) or strlen( $buffer ) > 0 )
        {
            // Move all complete lines out of $buffer, whatever follows the
            // last newline stays behind until more data has arrived.
            // Note: The actual buffer reading is done at the end of this while loop
            $lines = array();
            if ( strlen( $buffer ) > 0 )
            {
                $endPos = 0;
                while ( ( $eolPos = strpos( $buffer, "\n", $endPos ) ) !== false )
                {
                    $lines[] = array( 'text' => substr( $buffer, $endPos, $eolPos - $endPos ),
                                      'line' => $lineNum );
                    ++$lineNum;
                    $endPos = $eolPos + 1;
                }
                if ( $endPos > 0 )
                    $buffer = (string)substr( $buffer, $endPos );
            }

            // Once we have some lines start parsing them one at a time
            foreach ( $lines as $lineData )
            {
                $line = $lineData['text'];
                $linePos = $lineData['line'];

                // Get rid of any comments before we check the line
                $commentPos = strpos( $line, '#' );
                if ( $commentPos !== false )
                {
                    $line = substr( $line, 0, $commentPos );
                }
                // Skip empty lines
                if ( strlen( trim( $line ) ) == 0 )
                    continue;

                // A line ending in a colon names the identifier for the rules that follow
                if ( preg_match( '#^(.+):[ \t]*$#', $line, $matches ) )
                {
                    $identifier = $matches[1];
                    if ( !preg_match( '#^[a-zA-Z_-][a-zA-Z0-9_-]*$#', $identifier ) )
                    {
                        $this->warning( "Invalid identifier '$identifier', can only contain a-z, A-Z, 0-9, - and _",
                                        array( 'file' => $filename, 'from' => array( $linePos, strlen( $identifier ) ) ) );
                        $identifier = false;
                    }
                    continue;
                }
                if ( $identifier === false )
                {
                    $this->warning( "No identifier defined yet, skipping: '" . $line . "'",
                                    array( 'file' => $filename, 'from' => array( $linePos, 0 ) ) );
                    continue;
                }

                $sourceValue = false;
                $sourceEndValue = false;
                $destinationValues = false;
                $transposeValue = false;
                $transposeAdd = true;
                $moduloValue = 1;
                // One of: source, marker, range_input, range_marker_or_modulo, range_marker,
                //         map_input, replace_input, transpose_input, transpose_modulo
                $state = 'source';
                // Becomes map, transpose or replace once an output value is seen
                $type = false;

                $len = strlen( $line );
                $pos = 0;
                $failed = false;
                while ( $pos < $len )
                {
                    // Skip whitespace between tokens
                    while ( $pos < $len and
                            ( $line[$pos] == ' ' or
                              $line[$pos] == "\t" ) )
                    {
                        ++$pos;
                    }
                    if ( $pos >= $len )
                        break;

                    $char = $line[$pos];
                    $unicodeData = false;

                    // First see if a value starts at this position
                    if ( $char == '"' )
                    {
                        // Find the closing quote, a quote preceded by a backslash does not count
                        $delimiterPos = $pos;
                        while ( $delimiterPos < $len )
                        {
                            $delimiterPos = strpos( $line, '"', $delimiterPos + 1 );
                            if ( $delimiterPos === false or
                                 $delimiterPos <= $pos + 1 or
                                 $line[$delimiterPos - 1] != "\\" )
                                break;
                        }
                        if ( $delimiterPos === false )
                        {
                            $this->warning( "No end-quote found for line, skipping: '$line'",
                                            array( 'file' => $filename,
                                                   'from' => array( $linePos, $pos ),
                                                   'to' => array( $linePos, strlen( $line ) ) ) );
                            $pos = $len;
                            $failed = true;
                            break;
                        }
                        $str = str_replace( array( "\\\"", "\\\\" ),
                                            array( "\"", "\\" ),
                                            substr( $line, $pos + 1, $delimiterPos - $pos - 1 ) );
                        $pos = $delimiterPos + 1;
                        $unicodeData = array( 'value' => $str,
                                              'type' => 'string' );
                    }
                    else if ( $char == 'U' and
                              $pos + 1 < $len and
                              $line[$pos + 1] == '+' )
                    {
                        // U+XXXX, exactly four hex digits must follow
                        $hexPos = $pos + 2;
                        if ( $hexPos + 4 > $len )
                        {
                            $this->warning( "Found U+ value with " . ( 4 - ( $len - $hexPos ) ) . " missing hex numbers",
                                            array( 'file' => $filename,
                                                   'from' => array( $linePos, $hexPos ) ) );
                            $failed = true;
                            $pos = $hexPos;
                            break;
                        }
                        for ( $offset = 0; $offset < 4; ++$offset )
                        {
                            $hexChar = $line[$hexPos + $offset];
                            if ( $hexChar == ' ' or
                                 $hexChar == "\t" )
                            {
                                $this->warning( "Found U+ value with " . ( 4 - $offset ) . " missing hex numbers",
                                                array( 'file' => $filename,
                                                       'from' => array( $linePos, $hexPos ),
                                                       'to' => array( $linePos, $hexPos + $offset ) ) );
                                $failed = true;
                                $pos = $hexPos + $offset;
                                break;
                            }
                            if ( strpos( $hexValues, $hexChar ) === false )
                            {
                                $this->warning( "Found U+ value with invalid hex numbers ($hexChar)",
                                                array( 'file' => $filename,
                                                       'from' => array( $linePos, $hexPos ),
                                                       'to' => array( $linePos, $hexPos + $offset ) ) );
                                $pos = $hexPos + $offset;
                                $failed = true;
                                break;
                            }
                        }
                        if ( $failed )
                            break;
                        $unicodeData = array( 'value' => hexdec( substr( $line, $hexPos, 4 ) ),
                                              'type' => 'unicode' );
                        $pos = $hexPos + 4;
                    }
                    else if ( strpos( $hexValues, $char ) !== false and
                              $pos + 1 < $len and
                              strpos( $hexValues, $line[$pos + 1] ) !== false )
                    {
                        // Two hex digits. Both characters were verified by the
                        // condition above so only those two are decoded; taking
                        // more would pull in digits of a following token
                        // (e.g. "41-5A" must give 0x41, not 0x415).
                        $unicodeData = array( 'value' => hexdec( substr( $line, $pos, 2 ) ),
                                              'type' => 'ascii' );
                        $pos += 2;
                    }
                    else if ( substr( $line, $pos, 6 ) == 'remove' )
                    {
                        $unicodeData = array( 'value' => false,
                                              'type' => 'remove' );
                        $pos += 6;
                    }
                    else if ( substr( $line, $pos, 4 ) == 'keep' )
                    {
                        $unicodeData = array( 'value' => true,
                                              'type' => 'keep' );
                        $pos += 4;
                    }

                    // Set to a message when the token is not allowed in the current state
                    $problem = false;
                    if ( $unicodeData )
                    {
                        // A value was found, what it means depends on the state.
                        // Strings of more than one character are only allowed as output values.
                        $isMultiChar = ( $unicodeData['type'] == 'string' and
                                         strlen( $unicodeData['value'] ) > 1 );
                        if ( $state == 'source' )
                        {
                            if ( $isMultiChar )
                            {
                                $problem = "Text string with more than one character cannot be used as input value '" . $unicodeData['value'] . "'";
                            }
                            else
                            {
                                $sourceValue = $this->extractUnicodeValue( $unicodeData );
                                $state = 'marker';
                            }
                        }
                        else if ( $state == 'marker' )
                        {
                            $problem = "Source value not expected, a source value has already been extracted at $line" . "[$pos]";
                        }
                        else if ( $state == 'range_input' )
                        {
                            if ( $isMultiChar )
                            {
                                $problem = "Text string with more than one character cannot be used as range end value '" . $unicodeData['value'] . "'";
                            }
                            else
                            {
                                $sourceEndValue = $this->extractUnicodeValue( $unicodeData );
                                $state = 'range_marker_or_modulo';
                            }
                        }
                        else if ( $state == 'range_marker_or_modulo' or
                                  $state == 'range_marker' )
                        {
                            $problem = "Range value not expected, a range value has already been extracted at $line" . "[$pos]";
                        }
                        else if ( $state == 'map_input' or
                                  $state == 'replace_input' )
                        {
                            // Output values accumulate, several may follow each other
                            if ( !is_array( $destinationValues ) )
                                $destinationValues = array();
                            $destinationValues = array_merge( $destinationValues,
                                                              $this->extractUnicodeValues( $unicodeData ) );
                            $type = ( $state == 'map_input' ) ? 'map' : 'replace';
                        }
                        else if ( $state == 'transpose_input' )
                        {
                            if ( $isMultiChar )
                            {
                                $problem = "Text string with more than one character cannot be used as transpose value '" . $unicodeData['value'] . "'";
                            }
                            else
                            {
                                $transposeValue = $this->extractUnicodeValue( $unicodeData );
                                $type = 'transpose';
                            }
                        }
                        else if ( $state == 'transpose_modulo' )
                        {
                            if ( $isMultiChar )
                            {
                                $problem = "Text string with more than one character cannot be used as transpose modulo value '" . $unicodeData['value'] . "'";
                            }
                            else
                            {
                                $moduloValue = $this->extractUnicodeValue( $unicodeData );
                                if ( $moduloValue == 0 )
                                {
                                    $this->error( "Modulo value of 0 is not allowed, 1 will be used instead",
                                                  array( 'file' => $filename,
                                                         'from' => array( $linePos, $pos ) ) );
                                    // Note: There is another 0 check in generateSimpleMappingTable()
                                }
                                $state = 'range_marker';
                            }
                        }
                    }
                    else
                    {
                        // No value at this position, $char must be a marker
                        // which is valid for the current state
                        if ( $state == 'source' )
                        {
                            if ( $char == '=' )
                                $problem = "Cannot use map marker $char without prior character value";
                            else if ( $char == '+' or
                                      $char == '-' )
                                $problem = "Cannot use range marker $char without prior character value";
                            else
                                $problem = "Unknown character '$char', expecting input value";
                        }
                        else if ( $state == 'marker' )
                        {
                            if ( $char == '=' )
                            {
                                $state = 'map_input';
                                ++$pos;
                            }
                            else if ( $char == '-' )
                            {
                                $state = 'range_input';
                                ++$pos;
                            }
                            else if ( $char == '+' )
                                $problem = "Cannot use range marker $char without prior character value";
                            else
                                $problem = "Unknown character '$char', expecting marker";
                        }
                        else if ( $state == 'range_marker_or_modulo' or
                                  $state == 'range_marker' )
                        {
                            if ( $state == 'range_marker_or_modulo' and
                                 $char == '%' )
                            {
                                // Look for modulo value
                                $state = 'transpose_modulo';
                                ++$pos;
                            }
                            else if ( $char == '=' )
                            {
                                $state = 'replace_input';
                                ++$pos;
                            }
                            else if ( $char == '-' or
                                      $char == '+' )
                            {
                                $transposeAdd = ( $char == '+' );
                                $state = 'transpose_input';
                                ++$pos;
                            }
                            else
                                $problem = "Unknown character '$char', expecting range end value";
                        }
                        else if ( $state == 'map_input' )
                        {
                            if ( $char == '=' )
                                $problem = "Duplicate mapping marker $char";
                            else if ( $char == '-' or
                                      $char == '+' )
                                $problem = "Already mapping values, cannot use range/transpose marker $char";
                            else
                                $problem = "Unknown character '$char', expecting output values";
                        }
                        else if ( $state == 'transpose_modulo' )
                        {
                            if ( $char == '%' )
                                $problem = "Modulo marker already used, cannot use $char";
                            else if ( $char == '-' or
                                      $char == '+' )
                                $problem = "Transpose marker $char used, but no modulo value has been found yet";
                            else
                                $problem = "Unknown character '$char', expecting modulo value";
                        }
                        else if ( $state == 'transpose_input' )
                        {
                            if ( $char == '=' )
                                $problem = "Already transposing, cannot use mapping marker $char";
                            else if ( $char == '-' or
                                      $char == '+' )
                                $problem = "Duplicate transpose marker $char";
                            else
                                $problem = "Unknown character '$char', expecting transpose value";
                        }
                        else if ( $state == 'replace_input' )
                        {
                            if ( $char == '=' )
                                $problem = "Already replacing, cannot use mapping marker $char";
                            else if ( $char == '-' or
                                      $char == '+' )
                                $problem = "Already replacing, cannot use transpose marker $char";
                            else
                                $problem = "Unknown character '$char', expecting replace value";
                        }
                        else
                        {
                            // State range_input: a range marker was read and a value
                            // must follow. Without this branch $pos would never
                            // advance and the loop would not terminate.
                            $problem = "Unknown character '$char', expecting range end value";
                        }
                    }

                    if ( $problem !== false )
                    {
                        $this->warning( $problem,
                                        array( 'file' => $filename,
                                               'from' => array( $linePos, $pos ) ) );
                        $failed = true;
                        break;
                    }
                }

                if ( $failed or !$identifier )
                    continue;

                if ( !isset( $tbl[$identifier] ) )
                    $tbl[$identifier] = array();

                if ( $type == 'map' )
                {
                    $this->appendDirectMapping( $tbl[$identifier], $identifier, $sourceValue, $destinationValues );
                }
                else if ( $type == 'replace' )
                {
                    $this->appendReplaceMapping( $tbl[$identifier], $identifier, $sourceValue, $sourceEndValue, $destinationValues );
                }
                else if ( $type == 'transpose' )
                {
                    $this->appendTransposeMapping( $tbl[$identifier], $identifier, $sourceValue, $sourceEndValue, $transposeValue, $transposeAdd, $moduloValue );
                }
            }

            if ( !feof( $fd ) )
            {
                // Here we read more data from the file, appending to
                // the $buffer variable
                $buffer .= fread( $fd, 4096 );

                // Make sure we have Unix endline characters
                // (a CR LF pair which is split over two reads is seen as two
                // line ends, this gives one extra empty line)
                $buffer = preg_replace( "#(\r\n|\r|\n)#", "\n", $buffer );
            }
            else if ( strlen( $buffer ) > 0 )
            {
                // End of file and the last line has no newline. Terminate it so
                // the next round consumes it, otherwise $buffer would never
                // become empty and this loop would not end.
                $buffer .= "\n";
            }
        }

        fclose( $fd );

        $this->TransformationTables = array_merge( $this->TransformationTables, $tbl );
    }

    /*!
     \private
     Appends a mapping from one value to another.
     \param $block Current block it is working on
     \param $identifier The current identifier it is working on
     \param $sourceValue The original value
     \param $destinationValues The value it should be mapped to

     A list with exactly one destination value is stored as that value itself.
     The mapping is added to the last entry of \a $block when that entry is a
     direct mapping for the same identifier, otherwise a new entry is started.
    */
    function appendDirectMapping( &$block, $identifier, $sourceValue, $destinationValues )
    {
        $count = count( $block );
        if ( count( $destinationValues ) == 1 )
            $destinationValues = array_pop( $destinationValues );
        if ( isset( $block[$count - 1] ) and
             $block[$count - 1][0] == self::TYPE_DIRECT and
             $block[$count - 1][2] == $identifier )
        {
            $block[$count - 1][1][$sourceValue] = $destinationValues;
        }
        else
        {
            $block[] = array( self::TYPE_DIRECT,
                              array( $sourceValue => $destinationValues ),
                              $identifier );
        }
    }

    /*!
\private
     Appends a mapping for a range of values into a specific value
     \param $block Current block it is working on
     \param $identifier The current identifier it is working on
     \param $sourceValue The start of the original value
     \param $sourceEndValue The end of the original value
     \param $destinationValues The value it should be mapped to

     A list with exactly one destination value is stored as that value itself.
     The range is added to the last entry of \a $block when that entry is a
     replace mapping for the same identifier, otherwise a new entry is started.
    */
    function appendReplaceMapping( &$block, $identifier, $sourceValue, $sourceEndValue, $destinationValues )
    {
        $count = count( $block );
        if ( count( $destinationValues ) == 1 )
            $destinationValues = array_pop( $destinationValues );
        $rule = array( $sourceValue, $sourceEndValue, $destinationValues );
        if ( isset( $block[$count - 1] ) and
             $block[$count - 1][0] == self::TYPE_REPLACE and
             $block[$count - 1][2] == $identifier )
        {
            $block[$count - 1][1][] = $rule;
        }
        else
        {
            $block[] = array( self::TYPE_REPLACE,
                              array( $rule ),
                              $identifier );
        }
    }

    /*!
     \private
     Appends a mapping for characters by transposing them up or down.
     \param $block Current block it is working on
     \param $identifier The current identifier it is working on
     \param $sourceValue The start of the original value
     \param $sourceEndValue The end of the original value
     \param $transposeValue How much to transpose the values
     \param $addValue If \c true the $transposeValue is added to the range if not it is subtracted.
     \param $moduloValue The modulo value read for the range, stored with the rule as it is.

     The range is added to the last entry of \a $block when that entry is a
     range mapping for the same identifier, otherwise a new entry is started.
    */
    function appendTransposeMapping( &$block, $identifier, $sourceValue, $sourceEndValue, $transposeValue, $addValue, $moduloValue )
    {
        $count = count( $block );
        // The direction is stored as the sign of the transpose value
        $rule = array( $sourceValue, $sourceEndValue, $addValue ? $transposeValue : -$transposeValue, $moduloValue );
        if ( isset( $block[$count - 1] ) and
             $block[$count - 1][0] == self::TYPE_RANGE and
             $block[$count - 1][2] == $identifier )
        {
            $block[$count - 1][1][] = $rule;
        }
        else
        {
            $block[] = array( self::TYPE_RANGE,
                              array( $rule ),
                              $identifier );
        }
    }

    /*!
     \private
     \param $data Array with the keys \c type (string, ascii, unicode, remove or keep)
                  and \c value.
     \return The first unicode value for the data entry \a $data.
             \c false is returned for type \c remove, \c true for type \c keep and
             \c null for an unknown type or an empty string.
    */
    function extractUnicodeValue( $data )
    {
        $type = $data['type'];
        if ( $type == 'string' )
        {
            // There is no first character in an empty string, do not index into it
            if ( strlen( $data['value'] ) == 0 )
                return null;
            $list = $this->ISOUnicodeCodec->convertString( $data['value'][0] );
            return $list[0];
        }
        else if ( $type == 'ascii' or
                  $type == 'unicode' )
        {
            return $data['value'];
        }
        else if ( $type == 'remove' )
        {
            return false;
        }
        else if ( $type == 'keep' )
        {
            return true;
        }
        return null;
    }

    /*!
     \private
     \param $data Array with the keys \c type (string, ascii, unicode, remove or keep)
                  and \c value.
     \return The unicode values for the data entry \a $data.
             This is always an array, it is empty for an unknown type.
    */
    function extractUnicodeValues( $data )
    {
        $type = $data['type'];
        if ( $type == 'string' )
        {
            return $this->ISOUnicodeCodec->convertString( $data['value'] );
        }
        else if ( $type == 'ascii' or
                  $type == 'unicode' )
        {
            return array( $data['value'] );
        }
        else if ( $type == 'remove' )
        {
            return array( false );
        }
        else if ( $type == 'keep' )
        {
            return array( true );
        }
        return array();
    }

    /*!
     \private
     Goes trough all entries in \a $table and if it finds identifier references
     it will fetch the table for that identifier and merge in the current one.
     \return The expanded table.
*/
    function expandInheritance( $table )
    {
        $expanded = array();
        foreach ( $table as $entry )
        {
            // Anything which is not a string is a rule entry and is kept as it is
            if ( !is_string( $entry ) )
            {
                $expanded[] = $entry;
                continue;
            }

            // A string refers to another table by its identifier
            $identifier = $entry;
            $inherited = $this->mappingTable( $identifier );
            if ( !$inherited )
            {
                eZDebug::writeError( "Failed to fetch mapping table for identifier: '$identifier'" );
                continue;
            }
            // The referenced table can have references of its own
            $expanded = array_merge( $expanded, $this->expandInheritance( $inherited ) );
        }
        return $expanded;
    }

    /*!
     Turns the character list $list into an array with ordinal values
     \param $table The mapping table which the ordinals are run through with
                   mapOrdinals() before they are returned.
     \param $list Can be on of these types:
                  - String - each character is turned into an ordinal value
                  - Numeric - the numeric is used as ordinal value
                  - Boolean - means no character
                  - Array - each element is turned into an ordinal value by recursion
     \return An array with the mapped ordinal values.
    */
    function ordinalValues( $table, $list )
    {
        $collected = array();
        if ( is_string( $list ) )
        {
            for ( $index = 0, $length = strlen( $list ); $index < $length; ++$index )
                $collected[] = ord( $list[$index] );
        }
        else if ( is_numeric( $list ) )
        {
            $collected[] = $list;
        }
        else if ( is_array( $list ) )
        {
            foreach ( $list as $element )
                $collected = array_merge( $collected, eZCodeMapper::ordinalValues( $table, $element ) );
        }
        return eZCodeMapper::mapOrdinals( $table, $collected );
    }

    /*!
     Goes trough each ordinal in \a $ordinals and sees if there is mapping for it.
     If it is the mapping is applied and used as the new ordinal, if the mapping refers to
     an array it will be mapped recursively.
\param $table The mapping table (ordinal => ordinal or array of ordinals).
     \param $ordinals Array of ordinals to map.
     \param $activeKeys Internal, keys of \a $table that are already being expanded
                        higher up in the recursion. Callers should not pass it.
     \return A flat array with the mapped ordinals.

     \note Boolean entries are markers, not code points, and are never used as a
           lookup key (PHP would otherwise turn \c false / \c true into the keys 0 / 1).
     \note An ordinal that maps back to itself, directly or through other entries,
           is expanded only once per chain so the lookup always terminates.
    */
    function mapOrdinals( $table, $ordinals, $activeKeys = array() )
    {
        $mappedOrdinals = array();
        foreach ( $ordinals as $ordinal )
        {
            // Each input ordinal starts its own chain of lookups
            $visited = $activeKeys;
            while ( !is_array( $ordinal ) and
                    !is_bool( $ordinal ) and
                    isset( $table[$ordinal] ) and
                    !isset( $visited[$ordinal] ) )
            {
                $visited[$ordinal] = true;
                $ordinal = $table[$ordinal];
                if ( is_array( $ordinal ) )
                {
                    $ordinal = eZCodeMapper::mapOrdinals( $table, $ordinal, $visited );
                }
            }

            if ( is_array( $ordinal ) )
                $mappedOrdinals = array_merge( $mappedOrdinals, $ordinal );
            else
                $mappedOrdinals[] = $ordinal;
        }
        return $mappedOrdinals;
    }

    /*!
     Goes through all target codes in the mapping table \a $unicodeMap and maps
     those that match \a $fromCode into \a $toCode.

     \param $unicodeMap The mapping table (ordinal => ordinal, boolean or array of those).
     \param $fromCode The ordinal to look for among the targets.
     \param $toCode The replacement, a single value or an array of ordinals.
                    An array replacing an element of a target list is spliced
                    into that list so the list stays flat.
     \return \a $unicodeMap with the matching targets replaced.

     \note Boolean targets are markers and never match \a $fromCode; a loose
           comparison would otherwise treat \c false as code 0 and \c true as any
           non-zero code.
    */
    protected function mapExistingCodes( $unicodeMap, $fromCode, $toCode )
    {
        foreach ( $unicodeMap as $from => $to )
        {
            if ( is_array( $to ) )
            {
                $newTo = array();
                foreach ( $to as $ordinal )
                {
                    if ( is_bool( $ordinal ) or $ordinal != $fromCode )
                    {
                        $newTo[] = $ordinal;
                    }
                    else if ( is_array( $toCode ) )
                    {
                        $newTo = array_merge( $newTo, $toCode );
                    }
                    else
                    {
                        $newTo[] = $toCode;
                    }
                }
                $unicodeMap[$from] = $newTo;
            }
            else if ( !is_bool( $to ) and $to == $fromCode )
            {
                $unicodeMap[$from] = $toCode;
            }
        }
        return $unicodeMap;
    }

    /*!
     Goes through the mapping rules in the table \a $table and generates a simple
     mapping table which maps from one Unicode value to another (or array of values).

     The generation uses backward and forward propagation of the defined mappings
     to get the proper end result of a given value.
\param $table Array of rule entries, each being array( type, item [, identifier] )
                   where type is one of the TYPE_* constants:
                   - TYPE_DIRECT  - item maps source code => target (see ordinalValues())
                   - TYPE_RANGE   - item is a list of array( start, stop, add, modulo )
                   - TYPE_REPLACE - item is a list of array( start, stop, replacement )
     \param $allowedRanges Array of inclusive array( first, last ) code ranges. When it
                           is non-empty, source codes outside all ranges are skipped
                           and target codes outside all ranges are dropped. An empty
                           array means no restriction.
     \return The mapping table (source code => code, \c false or array of codes),
             or \c false if \a $table is not an array.

     \note The first mapping defined for a source code wins, later ones are skipped.
     \note This method can take a while if lots of rules are used
    */
    function generateSimpleMappingTable( $table, $allowedRanges )
    {
        if ( !is_array( $table ) )
            return false;

        $restricted = count( $allowedRanges ) > 0;
        $unicodeMap = array();
        foreach ( $table as $tableItem )
        {
            $type = $tableItem[0];
            $item = $tableItem[1];

            if ( $type == self::TYPE_DIRECT )
            {
                foreach ( $item as $fromCode => $toCode )
                {
                    // Forward propagation: resolve the target through what is mapped so far
                    $toCode = eZCodeMapper::ordinalValues( $unicodeMap, $toCode );
                    if ( !$restricted )
                    {
                        if ( count( $toCode ) == 1 )
                            $toCode = $toCode[0];

                        // If the mapping already exists we skip it
                        if ( isset( $unicodeMap[$fromCode] ) )
                            continue;

                        $unicodeMap[$fromCode] = $toCode;
                        // Backward propagation: earlier targets equal to $fromCode follow along
                        $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $fromCode, $toCode );
                    }
                    else
                    {
                        if ( !self::isInAllowedRanges( $fromCode, $allowedRanges ) )
                            continue;

                        // Keep boolean markers and the targets that lie in an allowed range
                        $newToCodeList = array();
                        foreach ( $toCode as $toOrdinal )
                        {
                            if ( is_bool( $toOrdinal ) or
                                 self::isInAllowedRanges( $toOrdinal, $allowedRanges ) )
                            {
                                $newToCodeList[] = $toOrdinal;
                            }
                        }
                        $toCode = $newToCodeList;
                        if ( count( $toCode ) > 0 )
                        {
                            if ( count( $toCode ) == 1 )
                                $toCode = $toCode[0];

                            // If the mapping already exists we skip it
                            if ( isset( $unicodeMap[$fromCode] ) )
                                continue;

                            $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $fromCode, $toCode );

                            $unicodeMap[$fromCode] = $toCode;
                        }
                    }
                }
            }
            else if ( $type == self::TYPE_RANGE )
            {
                foreach ( $item as $rangeItem )
                {
                    $start = $rangeItem[0];
                    $stop = $rangeItem[1];
                    if ( $start > $stop )
                    {
                        $tmp = $stop;
                        $stop = $start;
                        $start = $tmp;
                    }
                    $add = $rangeItem[2];
                    $modulo = $rangeItem[3];
                    // Sanity-check, to avoid infinite loops: the step must move forward
                    if ( $modulo < 1 )
                        $modulo = 1;
                    for ( $i = $start; $i <= $stop; $i += $modulo )
                    {
                        if ( $restricted and !self::isInAllowedRanges( $i, $allowedRanges ) )
                            continue;

                        // If the mapping already exists we skip it, before touching
                        // any existing targets
                        if ( isset( $unicodeMap[$i] ) )
                            continue;

                        $replace = eZCodeMapper::ordinalValues( $unicodeMap, $i + $add );
                        if ( $restricted )
                            $replace = self::filterAllowedOrdinals( $replace, $allowedRanges );
                        $replace = self::collapseOrdinals( $replace );

                        $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $i, $replace );
                        $unicodeMap[$i] = $replace;
                    }
                }
            }
            else if ( $type == self::TYPE_REPLACE )
            {
                foreach ( $item as $rangeItem )
                {
                    $start = $rangeItem[0];
                    $stop = $rangeItem[1];
                    if ( $start > $stop )
                    {
                        $tmp = $stop;
                        $stop = $start;
                        $start = $tmp;
                    }

                    // The replacement is resolved once, against the map as it is
                    // before this range is added
                    $replace = eZCodeMapper::ordinalValues( $unicodeMap, $rangeItem[2] );
                    if ( $restricted )
                        $replace = self::filterAllowedOrdinals( $replace, $allowedRanges );
                    $replace = self::collapseOrdinals( $replace );

                    for ( $i = $start; $i <= $stop; ++$i )
                    {
                        if ( $restricted and !self::isInAllowedRanges( $i, $allowedRanges ) )
                            continue;

                        // If the mapping already exists we skip it
                        if ( isset( $unicodeMap[$i] ) )
                            continue;

                        $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $i, $replace );
                        $unicodeMap[$i] = $replace;
                    }
                }
            }
        }
        return $unicodeMap;
    }

    /*!
     \private
     \static
     \return \c true if \a $code lies within at least one of the inclusive
             array( first, last ) pairs in \a $allowedRanges, \c false otherwise
             (also when \a $allowedRanges is empty).
    */
    protected static function isInAllowedRanges( $code, $allowedRanges )
    {
        foreach ( $allowedRanges as $allowedRange )
        {
            if ( $code >= $allowedRange[0] and
                 $code <= $allowedRange[1] )
            {
                return true;
            }
        }
        return false;
    }

    /*!
     \private
     \static
     \return The entries of \a $ordinals that lie within one of \a $allowedRanges,
             in their original order and each at most once per occurrence.
    */
    protected static function filterAllowedOrdinals( $ordinals, $allowedRanges )
    {
        $allowedOrdinals = array();
        foreach ( $ordinals as $ordinal )
        {
            if ( self::isInAllowedRanges( $ordinal, $allowedRanges ) )
                $allowedOrdinals[] = $ordinal;
        }
        return $allowedOrdinals;
    }

    /*!
     \private
     \static
     Turns a list of replacement ordinals into the form stored in the mapping table.
     \return \c false for an empty list, the single element for a one-element list
             and the list itself otherwise.
    */
    protected static function collapseOrdinals( $ordinals )
    {
        $total = count( $ordinals );
        if ( $total == 0 )
            return false;
        if ( $total == 1 )
            return $ordinals[0];
        return $ordinals;
    }

    /*!
     Generates a unicode mapping table for identifier \a $identifier.

     \param $identifier Is either a single identifier string or
                        an array with identifiers.
     \return The unicode mapping table for all defined identifiers, sorted by source code.
    */
    function generateMappingCode( $identifier )
    {
        if ( !is_array( $identifier ) )
            $identifier = array( $identifier );
        $table = $this->expandInheritance( $identifier );

        // We allow all characters for now
        $allowedRanges = array();
        $simpleTable = $this->generateSimpleMappingTable( $table, $allowedRanges );
        ksort( $simpleTable );
        return $simpleTable;
    }

    /*!
     Generates a mapping table for the character set $charset.
     This mapping table will only work for that character set but will be much faster
     and can be fed directly to the strtr() PHP function.
     \return the table or \c false if something failed.
*/
    function generateCharsetMappingTable( $unicodeTable, $charset )
    {
        //include_once( 'lib/ezi18n/classes/eztextcodec.php' );

        $codec = eZTextCodec::instance( 'unicode', $charset );
        if ( !$codec )
        {
            eZDebug::writeError( "Failed to create textcodec for charset '$charset'" );
            return false;
        }

        $charsetTable = array();
        foreach ( $unicodeTable as $match => $replacement )
        {
            // The codec is always handed a list of codes, so wrap single values
            $replacementCodes = is_array( $replacement ) ? $replacement : array( $replacement );

            $matchLocal = $codec->convertString( array( $match ) );
            $charsetTable[$matchLocal] = $codec->convertString( $replacementCodes );
        }

        // Reverse key order: a key is placed before any shorter key it starts with.
        // This is very important when working with utf8 which has
        // variable length for characters
        krsort( $charsetTable );
        return $charsetTable;
    }

    /*!
     Decodes a command into transformation rules.
     \param $name Name of the command
     \param $parameters Array of parameters for the command
     \return An array with the names of the available rules selected by the command,
             empty for commands that are handled by code or that are unknown.
    */
    function decodeCommand( $name, $parameters )
    {
        $availableRules = $this->ruleNames();
        $selectedRules = array();
        switch ( $name )
        {
            // Special code handlers, they have no rules of their own
            case 'url_cleanup_iri':
            case 'url_cleanup':
            case 'url_cleanup_compat':
            case 'identifier_cleanup':
                break;

            // Rules named <something>_<command>
            case 'normalize':
            case 'search_normalize':
            case 'decompose':
            case 'diacritical':
            case 'lowercase':
            case 'uppercase':
            case 'search_cleanup':
            {
                if ( count( $parameters ) != 0 )
                {
                    // Only the rules asked for, and only if they exist
                    foreach ( $parameters as $parameter )
                    {
                        $candidate = $parameter . '_' . $name;
                        if ( in_array( $candidate, $availableRules ) )
                            $selectedRules[] = $candidate;
                    }
                    break;
                }

                // No parameters: include all rules ending in _<command>
                $suffixPattern = '#_'. $name . '$#';
                foreach ( $availableRules as $candidate )
                {
                    if ( preg_match( $suffixPattern, $candidate ) )
                        $selectedRules[] = $candidate;
                }
            } break;

            // Rules named <from><divider><to>
            case 'transform':
            case 'transliterate':
            {
                $dividers = array( 'transform' => '_to_',
                                   'transliterate' => '_transliterate_' );
                $divider = $dividers[$name];
                $parameterCount = count( $parameters );
                if ( $parameterCount == 2 )
                {
                    $candidate = $parameters[0] . $divider . $parameters[1];
                    if ( in_array( $candidate, $availableRules ) )
                        $selectedRules[] = $candidate;
                }
                else if ( $parameterCount == 0 )
                {
                    // Include all transformation rules
                    $pairPattern = '#^[a-zA-Z][a-zA-Z0-9-]+'. $divider . '[a-zA-Z][a-zA-Z0-9-]+$#';
                    foreach ( $availableRules as $candidate )
                    {
                        if ( preg_match( $pairPattern, $candidate ) )
                            $selectedRules[] = $candidate;
                    }
                }
            } break;

            default:
            {
                // Commands registered by extensions are known but have no rules
                $ini = eZINI::instance( 'transform.ini' );
                $commands = $ini->variable( 'Extensions', 'Commands' );
                if ( !isset( $commands[$name] ) )
                {
                    eZDebug::writeError( "Unknown command '$name'",
                                         'eZCharTransform::decodeCommand' );
                }
            } break;
        }
        return $selectedRules;
    }

    /*!
     Generates PHP code for the command \a $command.
     \param $charsetName The name of the charset the text will be in,
                         this can be used to generate different code for different charsets.
     \return A string containing PHP code or \c false if not supported.
\param $command Array describing the command, the name is read from \c $command['command'].

     \note The generated code operates on a variable named \c $text.
     \note The code generated for \c search_cleanup performs the same replacements as
           executeCommandCode(), including keeping the character in front of a
           percent sign that does not follow a digit.
    */
    function generateCommandCode( $command, $charsetName )
    {
        $commandName = $command['command'];

        if ( $commandName == 'url_cleanup_iri' )
        {
            $charsetNameTxt = var_export( $charsetName, true );
            return "\$text = eZCharTransform::commandUrlCleanupIRI( \$text, $charsetNameTxt );\n";
        }

        if ( $commandName == 'url_cleanup' )
        {
            $charsetNameTxt = var_export( $charsetName, true );
            return "\$text = eZCharTransform::commandUrlCleanup( \$text, $charsetNameTxt );\n";
        }

        if ( $commandName == 'url_cleanup_compat' )
        {
            $charsetNameTxt = var_export( $charsetName, true );
            return "\$text = eZCharTransform::commandUrlCleanupCompat( \$text, $charsetNameTxt );\n";
        }

        if ( $commandName == 'identifier_cleanup' )
        {
            $code = ( "\$text = strtolower( \$text );\n" .
                      "\$text = preg_replace( array( \"#[^a-z0-9_ ]#\",\n" .
                      " \"/ /\",\n" .
                      " \"/__+/\",\n" .
                      " \"/^_|_$/\" ),\n" .
                      " array( \" \",\n" .
                      " \"_\",\n" .
                      " \"_\",\n" .
                      " \"\" ),\n" .
                      " \$text );\n" );
            return $code;
        }

        if ( $commandName == 'search_cleanup' )
        {
            $code = '';
            $nonCJKCharsets = $this->nonCJKCharsets();
            if ( !in_array( $charsetName, $nonCJKCharsets ) )
            {
                // The charset may contain CJK text, emit the N-Gram splitting code
                $code .= ( '// add N-Gram(N=2) chinese / japanese / korean multibyte characters' . "\n" .
                           '//include_once( \'lib/ezi18n/classes/eztextcodec.php\' );' . "\n" .
                           '$codec = eZTextCodec::instance( false, \'unicode\' );' . "\n" .
                           "\n" .
                           '$unicodeValueArray = $codec->convertString( $text );' . "\n" .
                           "\n" .
                           '$normalizedTextArray = array();' . "\n" .
                           '$bFlag = false;' . "\n" .
                           'foreach ( array_keys( $unicodeValueArray ) as $valueKey )' . "\n" .
                           '{' . "\n" .
                           ' // Check for word characters that should be broken up for search' . "\n" .
                           ' if ( ( $unicodeValueArray[$valueKey] >= 12289 and' . "\n" .
                           ' $unicodeValueArray[$valueKey] <= 12542 ) or' . "\n" .
                           ' ( $unicodeValueArray[$valueKey] >= 13312 and' . "\n" .
                           ' $unicodeValueArray[$valueKey] <= 40863 ) or' . "\n" .
                           ' ( $unicodeValueArray[$valueKey] >= 44032 and' . "\n" .
                           ' $unicodeValueArray[$valueKey] <= 55203 ) )' . "\n" .
                           ' {' . "\n" .
                           ' if ( $bFlag )' . "\n" .
                           ' {' . "\n" .
                           ' $normalizedTextArray[] = $unicodeValueArray[$valueKey];' . "\n" .
                           ' }' . "\n" .
                           ' $normalizedTextArray[] = 32; // A space' . "\n" .
                           ' $normalizedTextArray[] = $unicodeValueArray[$valueKey];' . "\n" .
                           ' $bFlag = true;' . "\n" .
                           ' }' . "\n" .
                           ' else' . "\n" .
                           ' {' . "\n" .
                           ' if ( $bFlag )' . "\n" .
                           ' {' . "\n" .
                           ' $normalizedTextArray[] = 32; // A space' . "\n" .
                           ' }' . "\n" .
                           ' $normalizedTextArray[] = $unicodeValueArray[$valueKey];' . "\n" .
                           ' $bFlag = false;' . "\n" .
                           ' }' . "\n" .
                           '}' . "\n" .
                           'if ( $bFlag )' . "\n" .
                           '{' . "\n" .
                           ' $normalizedTextArray[count($normalizedTextArray)-1]=32;' . "\n" .
                           '}' . "\n" .
                           '$revCodec = eZTextCodec::instance( \'unicode\', false ); // false means use internal charset' . "\n" .
                           '$text = $revCodec->convertString( $normalizedTextArray );' . "\n" );
            }

            // Dots inside words/numbers are kept, the rest is turned into space.
            // The last replacement is "$1 " so that only the percent sign is
            // replaced and the character in front of it is kept, exactly as
            // executeCommandCode() does.
            $code .= ( '$text = preg_replace( array( "#(\.){2,}#",' . "\n" .
                       ' "#^\.#",' . "\n" .
                       ' "#\s\.#",' . "\n" .
                       ' "#\.\s#",' . "\n" .
                       ' "#\.$#",' . "\n" .
                       ' "#([^0-9])%#" ),' . "\n" .
                       ' array( " ",' . "\n" .
                       ' " ",' . "\n" .
                       ' " ",' . "\n" .
                       ' " ",' . "\n" .
                       ' " ",' . "\n" .
                       ' "$1 " ),' . "\n" .
                       ' $text );' . "\n" .
                       '$ini = eZINI::instance();' . "\n" .
                       'if ( $ini->variable( \'SearchSettings\', \'EnableWildcard\' ) != \'true\' )' . "\n" .
                       '{' . "\n" .
                       ' $text = str_replace( "*", " ", $text );' . "\n" .
                       '}' . "\n" .
                       '$charset = eZTextCodec::internalCharset();' . "\n" .
                       '$hasUTF8 = ( $charset == "utf-8" );' . "\n" .
                       "\n" .
                       'if ( $hasUTF8 )' . "\n" .
                       '{' . "\n" .
                       ' $text = preg_replace( "#(\s+)#u", " ", $text );' . "\n" .
                       '}' . "\n" .
                       'else' . "\n" .
                       '{' . "\n" .
                       ' $text = preg_replace( "#(\s+)#", " ", $text );' . "\n" .
                       '}' );

            return $code;
        }

        // Anything else must be a command registered by an extension,
        // configured as <include path>:<class name>
        $ini = eZINI::instance( 'transform.ini' );
        $commands = $ini->variable( 'Extensions', 'Commands' );
        if ( isset( $commands[$commandName] ) )
        {
            // explode() instead of split(): split() is deprecated as of PHP 5.3
            // and no longer exists in PHP 7
            list( $path, $className ) = explode( ":", $commands[$commandName], 2 );
            if ( file_exists( $path ) )
            {
                $charsetNameTxt = var_export( $charsetName, true );
                $commandTxt = var_export( $commandName, true );
                $pathTxt = var_export( $path, true );
                $code = "include_once( $pathTxt );\n\$text = $className::executeCommand( \$text, $commandTxt, $charsetNameTxt );\n";
                return $code;
            }
            else
            {
                eZDebug::writeError( "Could not locate include file '$path' for transformation '" . $commandName . "'" );
            }
        }
        return false;
    }

    /*!
     Executes custom PHP code for the command \a $command.
     \param $charsetName The name of the charset the text will be in,
                         this can be used to execute different code for different charsets.
     \return \c true if the command is supported, \c false otherwise.
\param $text The text to transform, it is modified in place.
     \param $command Array describing the command, the name is read from \c $command['command'].
    */
    function executeCommandCode( &$text, $command, $charsetName )
    {
        $commandName = $command['command'];

        if ( $commandName == 'url_cleanup_iri' )
        {
            $text = eZCharTransform::commandUrlCleanupIRI( $text, $charsetName );
            return true;
        }
        else if ( $commandName == 'url_cleanup' )
        {
            $text = eZCharTransform::commandUrlCleanup( $text, $charsetName );
            return true;
        }
        else if ( $commandName == 'url_cleanup_compat' )
        {
            $text = eZCharTransform::commandUrlCleanupCompat( $text, $charsetName );
            return true;
        }
        else if ( $commandName == 'identifier_cleanup' )
        {
            // Lowercase, turn everything but a-z, 0-9, _ and space into space,
            // turn spaces into underscores, collapse runs of underscores and
            // strip one underscore from each end
            $text = strtolower( $text );
            $text = preg_replace( array( "#[^a-z0-9_ ]#",
                                         "/ /",
                                         "/__+/",
                                         "/^_|_$/" ),
                                  array( " ",
                                         "_",
                                         "_",
                                         "" ),
                                  $text );
            return true;
        }
        else if ( $commandName == 'search_cleanup' )
        {
            $nonCJKCharsets = $this->nonCJKCharsets();
            if ( !in_array( $charsetName, $nonCJKCharsets ) )
            {
                // 4 Add spaces after chinese / japanese / korean multibyte characters
                //include_once( 'lib/ezi18n/classes/eztextcodec.php' );
                $codec = eZTextCodec::instance( false, 'unicode' );

                $unicodeValueArray = $codec->convertString( $text );

                $normalizedTextArray = array();
                // True while the previously handled value was one of the split-up characters
                $bFlag = false;
                foreach ( array_keys( $unicodeValueArray ) as $valueKey )
                {
                    // Check for word characters that should be broken up for search
                    if ( ( $unicodeValueArray[$valueKey] >= 12289 and
                           $unicodeValueArray[$valueKey] <= 12542 ) or
                         ( $unicodeValueArray[$valueKey] >= 13312 and
                           $unicodeValueArray[$valueKey] <= 40863 ) or
                         ( $unicodeValueArray[$valueKey] >= 44032 and
                           $unicodeValueArray[$valueKey] <= 55203 ) )
                    {
                        if ( $bFlag )
                        {
                            // Completes the pair started by the previous character
                            $normalizedTextArray[] = $unicodeValueArray[$valueKey];
                        }
                        $normalizedTextArray[] = 32; // A space
                        $normalizedTextArray[] = $unicodeValueArray[$valueKey];
                        $bFlag = true;
                    }
                    else
                    {
                        if ( $bFlag )
                        {
                            $normalizedTextArray[] = 32; // A space
                        }
                        $normalizedTextArray[] = $unicodeValueArray[$valueKey];
                        $bFlag = false;
                    }
                }

                if ( $bFlag )
                {
                    // The text ended on a split-up character: the dangling start of
                    // a pair is overwritten with a space
                    $normalizedTextArray[ count( $normalizedTextArray ) - 1 ] = 32;
                }

                $revCodec = eZTextCodec::instance( 'unicode', false ); // false means use internal charset
                $text = $revCodec->convertString( $normalizedTextArray );
            }

            // Make sure dots inside words/numbers are kept, the rest is turned into space
            $text = preg_replace( array( "#(\.){2,}#",
                                         "#^\.#",
                                         "#\s\.#",
                                         "#\.\s#",
                                         "#\.$#",
                                         "#([^0-9])%#" ), // Keep only % after a number
                                  array( " ",
                                         " ",
                                         " ",
                                         " ",
                                         " ",
                                         "$1 " ),
                                  $text );
            $ini = eZINI::instance();
            if ( $ini->variable( 'SearchSettings', 'EnableWildcard' ) != 'true' )
            {
                $text = str_replace( "*", " ", $text );
            }
            $charset = eZTextCodec::internalCharset();
            $hasUTF8 = ( $charset == "utf-8" );

            // Collapse runs of whitespace, in unicode mode when the internal charset is utf-8
            if ( $hasUTF8 )
            {
                $text = preg_replace( "#(\s+)#u", " ", $text );
            }
            else
            {
                $text = preg_replace( "#(\s+)#", " ", $text );
            }

            return true;
        }
        else
        {
            // Commands registered by extensions, configured as <include path>:<class name>
            $ini = eZINI::instance( 'transform.ini' );
            $commands = $ini->variable( 'Extensions', 'Commands' );
            if ( isset( $commands[$commandName] ) )
            {
                // explode() instead of split(): split() is deprecated as of PHP 5.3
                // and no longer exists in PHP 7
                list( $path, $className ) = explode( ":", $commands[$commandName], 2 );
                if ( file_exists( $path ) )
                {
                    include_once( $path );
                    $text = call_user_func_array( array( $className, 'executeCommand' ),
                                                  array( $text, $commandName, $charsetName ) );
                    return true;
                }
                else
                {
                    eZDebug::writeError( "Could not locate include file '$path' for transformation '" . $commandName . "'" );
                }
            }
        }
        return false;
    }

    /*!
     \return An array with charsets that are certain to not contain CJK characters.
             The \c search_cleanup command skips its CJK handling for these.
    */
    function nonCJKCharsets()
    {
        return array( 'adobe-standard-encoding',
                      'cp437', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855', 'cp857',
                      'cp860', 'cp861', 'cp862', 'cp863', 'cp864', 'cp865', 'cp866',
                      'cp869', 'cp874',
                      'dec-mcs', 'hp-roman8',
                      'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5',
                      'iso-8859-6', 'iso-8859-7', 'iso-8859-8', 'iso-8859-9', 'iso-8859-10',
                      'iso-8859-11', 'iso-8859-13', 'iso-8859-14', 'iso-8859-15',
                      'koi8-r', 'koi8-u', 'macintosh', 'next', 'us-ascii',
                      'windows-1250', 'windows-1251', 'windows-1252', 'windows-1253',
                      'windows-1254', 'windows-1255', 'windows-1256', 'windows-1257',
                      'windows-1258' );
    }

    /// \privatesection
    /// Mapping tables indexed by rule identifier, see mappingTable() and ruleNames()
    public $TransformationTables;
    /// Initialised to an empty array by the constructor
    public $TransformationFiles;
    /// Codec used by extractUnicodeValue() and extractUnicodeValues() to convert strings to unicode values
    public $ISOUnicodeCodec;
}

?>