|
eZ Publish
[trunk]
|
<?php
/**
 * File containing the eZCodeMapper class.
 *
 * @copyright Copyright (C) 1999-2012 eZ Systems AS. All rights reserved.
 * @license http://www.gnu.org/licenses/gpl-2.0.txt GNU General Public License v2
 * @version //autogentag//
 * @package lib
 */

/*!
  \class eZCodeMapper ezcodemapper.php
  \ingroup eZI18N
  \brief Handles mapping of character codes

  Transformation rules are parsed from transformation files into per-identifier
  tables. Each table entry is tagged with one of the TYPE_* constants below.
*/

class eZCodeMapper
{
    /// Entry holds a direct map from a source code to its destination value(s)
    const TYPE_DIRECT = 1;
    /// Entry holds ranges which are transposed up or down
    const TYPE_RANGE = 2;
    /// Entry holds ranges which are replaced by fixed destination value(s)
    const TYPE_REPLACE = 3;

    /*!
     Constructor, starts out with no transformation tables and no loaded files.
    */
    function __construct()
    {
        $this->TransformationTables = array();
        $this->TransformationFiles = array();
    }

    /*!
     \deprecated PHP 4 style constructor, kept so existing explicit calls keep working.
                 It is declared after __construct() on purpose: PHP then treats it as
                 a plain method and never as a second constructor.
    */
    function eZCodeMapper()
    {
        $this->__construct();
    }

    /*!
     \return The mapping table for identifier \a $identifier or \c false if it is not found.
    */
    function mappingTable( $identifier )
    {
        if ( isset( $this->TransformationTables[$identifier] ) )
            return $this->TransformationTables[$identifier];
        return false;
    }

    /*!
     \return An array with the names of rules which are currently available.
    */
    function ruleNames()
    {
        return array_keys( $this->TransformationTables );
    }

    /*!
     Outputs error \a $text found in parsed file at position \a $position.

     \param $text The message to report.
     \param $position Either \c false, or an array with the keys \c file and \c from
                      (array of line and column) and optionally \c to (array of line and column).
                      When given it is turned into a prefix for the message.

     The message goes to eZCLI when that class exists, otherwise to eZDebug.
    */
    function error( $text, $position = false )
    {
        // Must start out defined, the position prefix is optional
        $str = '';
        if ( $position )
        {
            $str = $position['file'] . ':' . $position['from'][0] . ' C' . $position['from'][1];
            if ( isset( $position['to'] ) )
                $str .= ' -> L' . $position['to'][0] . ' C' . $position['to'][1];
            $str .= ':';
        }
        $str .= $text;
        if ( class_exists( 'ezcli' ) )
        {
            $cli = eZCLI::instance();
            $cli->error( $str );
        }
        else
        {
            eZDebug::writeError( $str, __METHOD__ );
        }
    }

    /*!
     Outputs warning \a $text found in parsed file at position \a $position.

     \param $text The message to report.
     \param $position Either \c false, or an array with the keys \c file and \c from
                      (array of line and column) and optionally \c to (array of line and column).
                      When given it is turned into a prefix for the message.

     The message goes to eZCLI when that class exists, otherwise to eZDebug.
    */
    function warning( $text, $position = false )
    {
        // Must start out defined, the position prefix is optional
        $str = '';
        if ( $position )
        {
            $str = $position['file'] . ':' . $position['from'][0] . ' C' . $position['from'][1];
            if ( isset( $position['to'] ) )
                $str .= ' -> L' . $position['to'][0] . ' C' . $position['to'][1];
            $str .= ':';
        }
        $str .= $text;
        if ( class_exists( 'ezcli' ) )
        {
            $cli = eZCLI::instance();
            $cli->warning( $str );
        }
        else
        {
            eZDebug::writeWarning( $str, __METHOD__ );
        }
    }

    /*!
     \return \c true if the transformation file \a $name is already loaded,
             ie. it has been recorded by parseTransformationFile().
    */
    function isTranformationLoaded( $name )
    {
        return in_array( $name, $this->TransformationFiles );
    }

    /*!
     Loads all transformation files defined in \c transform.ini to the current
     mapper. It will also load any transformations found in extensions.

     \param $currentCharset The name of the current charset in use. The caller must
                            make sure this is not an alias by using eZCharsetInfo::realCharsetCode()
     \param $transformationGroup The transformation group which is currently used or \c false for none.
*/
    function loadTransformationFiles( $currentCharset, $transformationGroup )
    {
        $ini = eZINI::instance( 'transform.ini' );
        $repositoryList = array( $ini->variable( 'Transformation', 'Repository' ) );
        $files = $ini->variable( 'Transformation', 'Files' );
        $extensions = $ini->variable( 'Transformation', 'Extensions' );
        $repositoryList = array_merge( $repositoryList,
                                       eZExtension::expandedPathList( $extensions, 'transformations' ) );

        // Check if the current charset maps to a unicode group
        // If it does it can trigger loading of additional files
        $unicodeGroups = array();
        $charsets = $ini->variable( 'Transformation', 'Charsets' );
        foreach ( $charsets as $entry )
        {
            // Entries have the form "charset;group", anything else cannot name a group
            if ( strpos( $entry, ';' ) === false )
                continue;
            list ( $charset, $group ) = explode( ';', $entry, 2 );
            $charset = eZCharsetInfo::realCharsetCode( $charset );
            if ( $charset == $currentCharset and
                 !in_array( $group, $unicodeGroups ) )
            {
                $unicodeGroups[] = $group;
            }
        }

        // If we are using transformation groups then add that as
        // a unicode group. This causes it load transformation files
        // specific to that group.
        if ( $transformationGroup !== false )
            $unicodeGroups[] = $transformationGroup;

        // Add any extra files from the unicode groups
        foreach ( $unicodeGroups as $unicodeGroup )
        {
            if ( !$ini->hasGroup( $unicodeGroup ) )
                continue;

            $files = array_merge( $files, $ini->variable( $unicodeGroup, 'Files' ) );
            $extensions = $ini->variable( $unicodeGroup, 'Extensions' );
            $repositoryList = array_merge( $repositoryList,
                                           eZExtension::expandedPathList( $extensions, 'transformations' ) );
        }

        foreach ( $files as $file )
        {
            // Only load files that are not currently loaded
            if ( $this->isTranformationLoaded( $file ) )
                continue;

            // Every repository is tried, a file found in several of them is parsed once per repository
            foreach ( $repositoryList as $repository )
            {
                $trFile = $repository . '/' . $file;
                if ( file_exists( $trFile ) )
                {
                    $this->parseTransformationFile( $trFile, $file );
                }
            }
        }
    }

    /*!
     Parses the transformation file \a $filename and appends any rules it finds
     to the current rule list.

     The file is read in chunks and handled one line at a time. Everything after
     a \c # on a line is dropped and empty lines are skipped. A line of the form
     \c name: starts a new identifier, all other lines are rules which are added
     to the table of the current identifier. A rule which cannot be parsed is
     reported with warning() and skipped.

     \param $filename The path of the file to parse
     \param $name The name of transformation file as it was requested, ie. without a path
     \return \c false if the file could not be opened, nothing otherwise.
    */
    function parseTransformationFile( $filename, $name )
    {
        $tbl = array();

        $fd = fopen( $filename, "rb" );
        if ( !$fd )
        {
            $this->error( "Failed opening $filename" );
            return false;
        }

        $this->TransformationFiles[] = $name;

        $this->ISOUnicodeCodec = eZTextCodec::instance( 'iso-8859-1', 'unicode' );

        $buffer = '';
        $lineNum = 1;
        $hexValues = "0123456789abcdefABCDEF";
        $identifier = false;

        // States in which a value must be a single character, with the
        // description used in the warning when a longer text string is found
        $singleValueStates = array( 'source' => 'input value',
                                    'range_input' => 'range end value',
                                    'transpose_input' => 'transpose value',
                                    'transpose_modulo' => 'transpose modulo value' );

        // The parser works on a buffer which is filled at the end of each
        // iteration. Complete lines are cut out of the buffer and parsed,
        // the unfinished remainder stays in the buffer for the next round.
        while ( !feof( $fd ) or strlen( $buffer ) > 0 )
        {
            // Look for complete lines and move them from $buffer to $lines
            $lines = array();
            $startPos = 0;
            while ( ( $eolPos = strpos( $buffer, "\n", $startPos ) ) !== false )
            {
                $lines[] = array( 'text' => substr( $buffer, $startPos, $eolPos - $startPos ),
                                  'line' => $lineNum );
                ++$lineNum;
                $startPos = $eolPos + 1;
            }
            if ( $startPos > 0 )
                $buffer = (string)substr( $buffer, $startPos );

            // Once we have some lines start parsing them one at a time
            foreach ( $lines as $lineData )
            {
                $line = $lineData['text'];
                $linePos = $lineData['line'];

                // Get rid of any comments before we check the line
                $commentPos = strpos( $line, '#' );
                if ( $commentPos !== false )
                    $line = substr( $line, 0, $commentPos );

                // Skip empty lines
                if ( strlen( trim( $line ) ) == 0 )
                    continue;

                $len = strlen( $line );

                // Identifier line, starts a new group of rules
                if ( preg_match( '#^(.+):[ \t]*$#', $line, $matches ) )
                {
                    $identifier = $matches[1];
                    if ( !preg_match( '#^[a-zA-Z_-][a-zA-Z0-9_-]*$#', $identifier ) )
                    {
                        $this->warning( "Invalid identifier '$identifier', can only contain a-z, a-Z - and _",
                                        array( 'file' => $filename, 'from' => array( $linePos, strlen( $identifier ) ) ) );
                        $identifier = false;
                    }
                    continue;
                }
                if ( $identifier === false )
                {
                    $this->warning( "No identifier defined yet, skipping: '" . $line . "'",
                                    array( 'file' => $filename, 'from' => array( $linePos, 0 ) ) );
                    continue;
                }

                // Rule line, tokenize it while stepping trough these states:
                // source, marker, range_input, range_marker_or_modulo, range_marker,
                // map_input, transpose_input, replace_input, transpose_modulo
                $state = 'source';
                // map, transpose, replace
                $type = false;
                $sourceValue = false;
                $sourceEndValue = false;
                $destinationValues = false;
                $transposeValue = false;
                $transposeAdd = true;
                $moduloValue = 1;

                $pos = 0;
                $failed = false;
                while ( $pos < $len )
                {
                    // Skip whitespace in front of the next token
                    while ( $pos < $len and
                            ( $line[$pos] == ' ' or
                              $line[$pos] == "\t" ) )
                    {
                        ++$pos;
                    }
                    if ( $pos >= $len )
                        break;

                    $char = $line[$pos];
                    $unicodeData = false;
                    if ( $char == '"' )
                    {
                        // Quoted text, find the end quote while stepping over quotes preceded by a backslash
                        $delimiterPos = $pos;
                        while ( $delimiterPos < $len )
                        {
                            $delimiterPos = strpos( $line, '"', $delimiterPos + 1 );
                            if ( $delimiterPos === false or
                                 $delimiterPos <= $pos + 1 or
                                 $line[$delimiterPos - 1] != "\\" )
                                break;
                        }
                        if ( $delimiterPos === false )
                        {
                            $this->warning( "No end-quote found for line, skipping: '$line'",
                                            array( 'file' => $filename,
                                                   'from' => array( $linePos, $pos ),
                                                   'to' => array( $linePos, strlen( $line ) ) ) );
                            $failed = true;
                            break;
                        }
                        $str = str_replace( array( "\\\"", "\\\\" ),
                                            array( "\"", "\\" ),
                                            substr( $line, $pos + 1, $delimiterPos - $pos - 1 ) );
                        $pos = $delimiterPos + 1;
                        $unicodeData = array( 'value' => $str,
                                              'type' => 'string' );
                    }
                    else if ( $char == 'U' and
                              $pos + 1 < $len and
                              $line[$pos + 1] == '+' )
                    {
                        // Unicode value, U+ followed by exactly four hex digits
                        $hexPos = $pos + 2;
                        if ( $hexPos + 4 > $len )
                        {
                            $this->warning( "Found U+ value with " . ( 4 - ( $len - $hexPos ) ) . " missing hex numbers",
                                            array( 'file' => $filename,
                                                   'from' => array( $linePos, $hexPos ) ) );
                            $failed = true;
                            break;
                        }
                        $problem = false;
                        for ( $offset = 0; $offset < 4; ++$offset )
                        {
                            $hexChar = $line[$hexPos + $offset];
                            if ( $hexChar == ' ' or
                                 $hexChar == "\t" )
                            {
                                $problem = "Found U+ value with " . ( 4 - $offset ) . " missing hex numbers";
                            }
                            else if ( strpos( $hexValues, $hexChar ) === false )
                            {
                                $problem = "Found U+ value with invalid hex numbers ($hexChar)";
                            }
                            if ( $problem !== false )
                                break;
                        }
                        if ( $problem !== false )
                        {
                            $this->warning( $problem,
                                            array( 'file' => $filename,
                                                   'from' => array( $linePos, $hexPos ),
                                                   'to' => array( $linePos, $hexPos + $offset ) ) );
                            $failed = true;
                            break;
                        }
                        $unicodeData = array( 'value' => hexdec( substr( $line, $hexPos, 4 ) ),
                                              'type' => 'unicode' );
                        $pos = $hexPos + 4;
                    }
                    else if ( strpos( $hexValues, $char ) !== false and
                              $pos + 1 < $len and
                              strpos( $hexValues, $line[$pos + 1] ) !== false )
                    {
                        // ASCII value, exactly two hex digits. The condition above has
                        // already verified both digits so no further checks are needed.
                        // Only the two digits are decoded, reading further would pull
                        // hex digits of a following token into the value (eg. 41-5A).
                        $unicodeData = array( 'value' => hexdec( substr( $line, $pos, 2 ) ),
                                              'type' => 'ascii' );
                        $pos += 2;
                    }
                    else if ( substr( $line, $pos, 6 ) == 'remove' )
                    {
                        $unicodeData = array( 'value' => false,
                                              'type' => 'remove' );
                        $pos += 6;
                    }
                    else if ( substr( $line, $pos, 4 ) == 'keep' )
                    {
                        $unicodeData = array( 'value' => true,
                                              'type' => 'keep' );
                        $pos += 4;
                    }

                    if ( $unicodeData )
                    {
                        // A value was read, what it means depends on the state
                        if ( isset( $singleValueStates[$state] ) and
                             $unicodeData['type'] == 'string' and
                             strlen( $unicodeData['value'] ) > 1 )
                        {
                            $this->warning( "Text string with more than one character cannot be used as " . $singleValueStates[$state] . " '" . $unicodeData['value'] . "'",
                                            array( 'file' => $filename,
                                                   'from' => array( $linePos, $pos ) ) );
                            $failed = true;
                            break;
                        }

                        if ( $state == 'source' )
                        {
                            $sourceValue = $this->extractUnicodeValue( $unicodeData );
                            $state = 'marker';
                        }
                        else if ( $state == 'range_input' )
                        {
                            $sourceEndValue = $this->extractUnicodeValue( $unicodeData );
                            $state = 'range_marker_or_modulo';
                        }
                        else if ( $state == 'map_input' or
                                  $state == 'replace_input' )
                        {
                            // Any number of destination values may follow each other
                            if ( !is_array( $destinationValues ) )
                                $destinationValues = array();
                            $destinationValues = array_merge( $destinationValues,
                                                              $this->extractUnicodeValues( $unicodeData ) );
                            $type = ( $state == 'map_input' ? 'map' : 'replace' );
                        }
                        else if ( $state == 'transpose_input' )
                        {
                            $transposeValue = $this->extractUnicodeValue( $unicodeData );
                            $type = 'transpose';
                        }
                        else if ( $state == 'transpose_modulo' )
                        {
                            $moduloValue = $this->extractUnicodeValue( $unicodeData );
                            if ( $moduloValue == 0 )
                            {
                                $this->error( "Modulo value of 0 is not allowed, 1 will be used instead",
                                              array( 'file' => $filename,
                                                     'from' => array( $linePos, $pos ) ) );
                                // Note: There is another 0 check in generateSimpleMappingTable()
                            }
                            $state = 'range_marker';
                        }
                        else
                        {
                            // marker, range_marker_or_modulo and range_marker expect a marker, not a value
                            if ( $state == 'marker' )
                                $message = "Source value not expected, a source value has already been extracted at $line" . "[$pos]";
                            else
                                $message = "Range value not expected, a range value has already been extracted at $line" . "[$pos]";
                            $this->warning( $message,
                                            array( 'file' => $filename,
                                                   'from' => array( $linePos, $pos ) ) );
                            $failed = true;
                            break;
                        }
                    }
                    else
                    {
                        // No value was read so $char must be a marker which is valid for
                        // the state. A valid marker sets $nextState, everything else
                        // sets $message and ends the parsing of this line.
                        $isTransposeMarker = ( $char == '-' or $char == '+' );
                        $nextState = false;
                        $message = false;
                        if ( $state == 'source' )
                        {
                            if ( $char == '=' )
                                $message = "Cannot use map marker $char without prior character value";
                            else if ( $isTransposeMarker )
                                $message = "Cannot use range marker $char without prior character value";
                            else
                                $message = "Unknown character '$char', expecting input value";
                        }
                        else if ( $state == 'marker' )
                        {
                            if ( $char == '=' )
                                $nextState = 'map_input';
                            else if ( $char == '-' )
                                $nextState = 'range_input';
                            else if ( $char == '+' )
                                $message = "Cannot use range marker $char without prior character value";
                            else
                                $message = "Unknown character '$char', expecting marker";
                        }
                        else if ( $state == 'range_input' )
                        {
                            // Without this case $pos would never advance and the line would be parsed forever
                            $message = "Unknown character '$char', expecting range end value";
                        }
                        else if ( $state == 'range_marker_or_modulo' or
                                  $state == 'range_marker' )
                        {
                            if ( $state == 'range_marker_or_modulo' and
                                 $char == '%' )
                            {
                                // Look for modulo value
                                $nextState = 'transpose_modulo';
                            }
                            else if ( $char == '=' )
                            {
                                $nextState = 'replace_input';
                            }
                            else if ( $isTransposeMarker )
                            {
                                $transposeAdd = ( $char == '+' );
                                $nextState = 'transpose_input';
                            }
                            else
                            {
                                $message = "Unknown character '$char', expecting range end value";
                            }
                        }
                        else if ( $state == 'map_input' )
                        {
                            if ( $char == '=' )
                                $message = "Duplicate mapping marker $char";
                            else if ( $isTransposeMarker )
                                $message = "Already mapping values, cannot use range/transpose marker $char";
                            else
                                $message = "Unknown character '$char', expecting output values";
                        }
                        else if ( $state == 'transpose_modulo' )
                        {
                            if ( $char == '%' )
                                $message = "Modulo marker already used, cannot use $char";
                            else if ( $isTransposeMarker )
                                $message = "Transpose marker $char used, but no modulo value has been found yet";
                            else
                                $message = "Unknown character '$char', expecting modulo value";
                        }
                        else if ( $state == 'transpose_input' )
                        {
                            if ( $char == '=' )
                                $message = "Already transposing, cannot use mapping marker $char";
                            else if ( $isTransposeMarker )
                                $message = "Duplicate transpose marker $char";
                            else
                                $message = "Unknown character '$char', expecting transpose value";
                        }
                        else
                        {
                            // replace_input
                            if ( $char == '=' )
                                $message = "Already replacing, cannot use mapping marker $char";
                            else if ( $isTransposeMarker )
                                $message = "Already replacing, cannot use transpose marker $char";
                            else
                                $message = "Unknown character '$char', expecting replace value";
                        }

                        if ( $nextState === false )
                        {
                            $this->warning( $message,
                                            array( 'file' => $filename,
                                                   'from' => array( $linePos, $pos ) ) );
                            $failed = true;
                            break;
                        }
                        $state = $nextState;
                        ++$pos;
                    }
                }

                if ( $failed )
                    continue;

                if ( !isset( $tbl[$identifier] ) )
                    $tbl[$identifier] = array();

                // A line which ended before any destination or transpose value has no type and adds nothing
                if ( $type == 'map' )
                {
                    $this->appendDirectMapping( $tbl[$identifier], $identifier, $sourceValue, $destinationValues );
                }
                else if ( $type == 'replace' )
                {
                    $this->appendReplaceMapping( $tbl[$identifier], $identifier, $sourceValue, $sourceEndValue, $destinationValues );
                }
                else if ( $type == 'transpose' )
                {
                    $this->appendTransposeMapping( $tbl[$identifier], $identifier, $sourceValue, $sourceEndValue, $transposeValue, $transposeAdd, $moduloValue );
                }
            }

            if ( !feof( $fd ) )
            {
                // Read more data from the file, appending to the buffer
                $buffer .= fread( $fd, 4096 );

                // Make sure we have Unix endline characters
                $buffer = preg_replace( "#(\r\n|\r|\n)#", "\n", $buffer );
            }
            else if ( strlen( $buffer ) > 0 )
            {
                // The file did not end with a newline. Terminate the last line so
                // it is parsed in the next round, otherwise it would stay in the
                // buffer and this loop would never end.
                $buffer .= "\n";
            }
        }

        fclose( $fd );

        $this->TransformationTables = array_merge( $this->TransformationTables, $tbl );
    }

    /*!
     \private
     Appends a mapping from one value to another.
     Consecutive direct mappings for the same identifier are collected in one table entry.
     \param $block Current block it is working on, it is modified in place
     \param $identifier The current identifier it is working on
     \param $sourceValue The original value
     \param $destinationValues The values it should be mapped to, a list with one element is stored as that element
    */
    function appendDirectMapping( &$block, $identifier, $sourceValue, $destinationValues )
    {
        if ( count( $destinationValues ) == 1 )
            $destinationValues = array_pop( $destinationValues );

        $last = count( $block ) - 1;
        if ( isset( $block[$last] ) and
             $block[$last][0] == self::TYPE_DIRECT and
             $block[$last][2] == $identifier )
        {
            $block[$last][1][$sourceValue] = $destinationValues;
        }
        else
        {
            $block[] = array( self::TYPE_DIRECT,
                              array( $sourceValue => $destinationValues ),
                              $identifier );
        }
    }

    /*!
\private
     Appends a mapping for a range of values into a specific value.
     Consecutive replace mappings for the same identifier are collected in one table entry.
     \param $block Current block it is working on, it is modified in place
     \param $identifier The current identifier it is working on
     \param $sourceValue The start of the original value
     \param $sourceEndValue The end of the original value
     \param $destinationValues The values it should be mapped to, a list with one element is stored as that element
    */
    function appendReplaceMapping( &$block, $identifier, $sourceValue, $sourceEndValue, $destinationValues )
    {
        if ( count( $destinationValues ) == 1 )
            $destinationValues = array_pop( $destinationValues );

        $entry = array( $sourceValue, $sourceEndValue, $destinationValues );
        $last = count( $block ) - 1;
        if ( isset( $block[$last] ) and
             $block[$last][0] == self::TYPE_REPLACE and
             $block[$last][2] == $identifier )
        {
            $block[$last][1][] = $entry;
        }
        else
        {
            $block[] = array( self::TYPE_REPLACE,
                              array( $entry ),
                              $identifier );
        }
    }

    /*!
     \private
     Appends a mapping for characters by transposing them up or down.
     Consecutive transpose mappings for the same identifier are collected in one table entry.
     \param $block Current block it is working on, it is modified in place
     \param $identifier The current identifier it is working on
     \param $sourceValue The start of the original value
     \param $sourceEndValue The end of the original value
     \param $transposeValue How much to transpose the values
     \param $addValue If \c true the $transposeValue is added to the range if not it is subtracted.
     \param $moduloValue The modulo value read for the range, it is stored as the last element of the entry.
    */
    function appendTransposeMapping( &$block, $identifier, $sourceValue, $sourceEndValue, $transposeValue, $addValue, $moduloValue )
    {
        // The direction is stored as the sign of the transpose value
        $entry = array( $sourceValue, $sourceEndValue, $addValue ? $transposeValue : -$transposeValue, $moduloValue );
        $last = count( $block ) - 1;
        if ( isset( $block[$last] ) and
             $block[$last][0] == self::TYPE_RANGE and
             $block[$last][2] == $identifier )
        {
            $block[$last][1][] = $entry;
        }
        else
        {
            $block[] = array( self::TYPE_RANGE,
                              array( $entry ),
                              $identifier );
        }
    }

    /*!
     \private
     \param $data An array with the keys \c type (string, ascii, unicode, remove or keep) and \c value.
     \return The first unicode value for the data entry \a $data.
             \c remove gives \c false, \c keep gives \c true. An empty text string
             or an unknown type gives \c null.
    */
    function extractUnicodeValue( $data )
    {
        switch ( $data['type'] )
        {
            case 'string':
            {
                // An empty quoted string has no first character to convert
                if ( strlen( $data['value'] ) == 0 )
                    return null;
                $list = $this->ISOUnicodeCodec->convertString( $data['value'][0] );
                return isset( $list[0] ) ? $list[0] : null;
            }

            case 'ascii':
            case 'unicode':
                return $data['value'];

            case 'remove':
                return false;

            case 'keep':
                return true;
        }
        return null;
    }

    /*!
     \private
     \param $data An array with the keys \c type (string, ascii, unicode, remove or keep) and \c value.
     \return An array with the unicode values for the data entry \a $data.
             \c remove gives an array with \c false, \c keep an array with \c true
             and an unknown type gives an empty array.
    */
    function extractUnicodeValues( $data )
    {
        switch ( $data['type'] )
        {
            case 'string':
                return $this->ISOUnicodeCodec->convertString( $data['value'] );

            case 'ascii':
            case 'unicode':
                return array( $data['value'] );

            case 'remove':
                return array( false );

            case 'keep':
                return array( true );
        }
        return array();
    }

    /*!
     \private
     Goes through all entries in \a $table and if it finds identifier references
     it will fetch the table for that identifier and merge in the current one.
     \return The expanded table.
*/
    function expandInheritance( $table )
    {
        $newTable = array();
        foreach ( $table as $tableItem )
        {
            // Anything which is not a string is a real entry and is kept as it is
            if ( !is_string( $tableItem ) )
            {
                $newTable[] = $tableItem;
                continue;
            }

            // A string refers to another table by its identifier
            $identifier = $tableItem;
            $subTable = $this->mappingTable( $identifier );
            if ( !$subTable )
            {
                eZDebug::writeError( "Failed to fetch mapping table for identifier: '$identifier'" );
                continue;
            }

            // The referenced table can have references of its own
            $newTable = array_merge( $newTable, $this->expandInheritance( $subTable ) );
        }
        return $newTable;
    }

    /*!
     Turns the character list $list into an array with ordinal values, the
     ordinals are then mapped trough \a $table with mapOrdinals().
     \param $table The mapping table which is passed on to mapOrdinals()
     \param $list Can be on of these types:
                  - String - each character is turned into an ordinal value
                  - Numeric - the numeric is used as ordinal value
                  - Boolean - means no character
                  - Array - each element is turned into an ordinal value by recursion
     \return An array with the mapped ordinal values
    */
    function ordinalValues( $table, $list )
    {
        $ordinals = array();
        if ( is_string( $list ) )
        {
            // str_split() would turn an empty string into one empty element
            if ( $list !== '' )
                $ordinals = array_map( 'ord', str_split( $list ) );
        }
        else if ( is_numeric( $list ) )
        {
            $ordinals = array( $list );
        }
        else if ( is_array( $list ) )
        {
            foreach ( $list as $item )
            {
                $ordinals = array_merge( $ordinals, self::ordinalValues( $table, $item ) );
            }
        }
        return self::mapOrdinals( $table, $ordinals );
    }

    /*!
     Goes through each ordinal in \a $ordinals and sees if there is mapping for it.
     If it is the mapping is applied and used as the new ordinal, if the mapping refers to
     an array it will be mapped recursively.
\param $table The mapping table, ordinal => ordinal, array of ordinals or \c false.
\param $ordinals List of ordinals to map.
\return A flat array with the fully mapped ordinals.

\note Boolean entries are markers, not code points, and are never looked up
      in \a $table. A chain of scalar mappings that returns to an ordinal it
      has already visited (for instance an identity mapping) stops there
      instead of looping forever.
    */
    function mapOrdinals( $table, $ordinals )
    {
        $mappedOrdinals = array();
        foreach ( $ordinals as $ordinal )
        {
            // Ordinals already followed for this entry, used to detect cycles
            $visited = array();
            while ( !is_array( $ordinal ) and
                    !is_bool( $ordinal ) and
                    isset( $table[$ordinal] ) )
            {
                if ( isset( $visited[$ordinal] ) )
                    break;
                $visited[$ordinal] = true;

                $ordinal = $table[$ordinal];
                if ( is_array( $ordinal ) )
                {
                    $ordinal = eZCodeMapper::mapOrdinals( $table, $ordinal );
                }
            }

            if ( is_array( $ordinal ) )
                $mappedOrdinals = array_merge( $mappedOrdinals, $ordinal );
            else
                $mappedOrdinals[] = $ordinal;
        }
        return $mappedOrdinals;
    }

    /*!
     Goes through all target codes in the mapping table \a $unicodeMap and maps
     those that match \a $fromCode into \a $toCode.

     \param $unicodeMap The mapping table to update, it is passed by value.
     \param $fromCode The ordinal to look for among the existing targets.
     \param $toCode The replacement, a single value or an array of ordinals.
                    An array is spliced into array targets so that they stay flat.
     \return The updated \a $unicodeMap

     \note Boolean targets are markers and never match \a $fromCode; a loose
           comparison would otherwise treat \c false as code 0 and \c true as
           any non-zero code.
    */
    protected function mapExistingCodes( $unicodeMap, $fromCode, $toCode )
    {
        // List form of the replacement, used when splicing into array targets
        $toCodeList = is_array( $toCode ) ? $toCode : array( $toCode );

        foreach ( $unicodeMap as $from => $to )
        {
            if ( is_array( $to ) )
            {
                $newTo = array();
                foreach ( $to as $ordinal )
                {
                    if ( !is_bool( $ordinal ) and $ordinal == $fromCode )
                    {
                        $newTo = array_merge( $newTo, $toCodeList );
                    }
                    else
                    {
                        $newTo[] = $ordinal;
                    }
                }
                $unicodeMap[$from] = $newTo;
            }
            else if ( !is_bool( $to ) and $to == $fromCode )
            {
                $unicodeMap[$from] = $toCode;
            }
        }
        return $unicodeMap;
    }

    /*!
     Goes through the mapping rules in the table \a $table and generates a simple
     mapping table which maps from one Unicode value to another (or array of values).

     The generation uses backward and forward propagation of the defined mappings
     to get the proper end result of a given value.
\note This method can take a while if lots of rules are used

\param $table List of table items, each item is an array with the rule type
              (one of the TYPE_* constants) as element 0 and the rule data
              as element 1.
\param $allowedRanges List of array( first, last ) ordinal ranges. When it is
                      empty all ordinals are allowed, otherwise both the source
                      and the target ordinals must lie inside one of the ranges.
\return The mapping table (ordinal => ordinal, array of ordinals or \c false)
        or \c false if \a $table is not an array.
    */
    function generateSimpleMappingTable( $table, $allowedRanges )
    {
        if ( !is_array( $table ) )
            return false;

        $restricted = count( $allowedRanges ) > 0;
        $unicodeMap = array();
        foreach ( $table as $tableItem )
        {
            $type = $tableItem[0];
            $item = $tableItem[1];

            if ( $type == self::TYPE_DIRECT )
            {
                foreach ( $item as $fromCode => $toCode )
                {
                    $toCode = eZCodeMapper::ordinalValues( $unicodeMap, $toCode );
                    if ( !$restricted )
                    {
                        if ( count( $toCode ) == 1 )
                            $toCode = $toCode[0];

                        // If the mapping already exists we skip it
                        if ( isset( $unicodeMap[$fromCode] ) )
                            continue;

                        $unicodeMap[$fromCode] = $toCode;
                        $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $fromCode, $toCode );
                        continue;
                    }

                    if ( !self::isOrdinalInRanges( $fromCode, $allowedRanges ) )
                        continue;

                    // Keep boolean markers and those targets that are inside an
                    // allowed range. The range test used to be evaluated without
                    // its result being recorded, so nothing was filtered out.
                    $newToCodeList = array();
                    foreach ( $toCode as $toOrdinal )
                    {
                        if ( is_bool( $toOrdinal ) or
                             self::isOrdinalInRanges( $toOrdinal, $allowedRanges ) )
                        {
                            $newToCodeList[] = $toOrdinal;
                        }
                    }
                    if ( count( $newToCodeList ) == 0 )
                        continue;

                    $toCode = count( $newToCodeList ) == 1 ? $newToCodeList[0] : $newToCodeList;

                    // If the mapping already exists we skip it
                    if ( isset( $unicodeMap[$fromCode] ) )
                        continue;

                    $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $fromCode, $toCode );
                    $unicodeMap[$fromCode] = $toCode;
                }
            }
            else if ( $type == self::TYPE_RANGE )
            {
                foreach ( $item as $rangeItem )
                {
                    $start = $rangeItem[0];
                    $stop = $rangeItem[1];
                    if ( $start > $stop )
                    {
                        $tmp = $stop;
                        $stop = $start;
                        $start = $tmp;
                    }
                    $add = $rangeItem[2];
                    $modulo = $rangeItem[3];
                    // Sanity-check, to avoid infinite loops: the step must move
                    // forward, so zero and negative values are replaced by 1
                    if ( $modulo < 1 )
                        $modulo = 1;

                    for ( $i = $start; $i <= $stop; $i += $modulo )
                    {
                        if ( $restricted and !self::isOrdinalInRanges( $i, $allowedRanges ) )
                            continue;

                        $replace = eZCodeMapper::ordinalValues( $unicodeMap, $i + $add );
                        if ( !$restricted )
                        {
                            $replace = self::collapseOrdinalList( $replace );

                            // Existing targets are updated before the exists check
                            // in this branch, this order is kept as it always was
                            $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $i, $replace );

                            // If the mapping already exists we skip it
                            if ( isset( $unicodeMap[$i] ) )
                                continue;

                            $unicodeMap[$i] = $replace;
                        }
                        else
                        {
                            $replace = self::collapseOrdinalList(
                                self::filterOrdinalsByRanges( $replace, $allowedRanges ) );

                            // If the mapping already exists we skip it
                            if ( isset( $unicodeMap[$i] ) )
                                continue;

                            $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $i, $replace );
                            $unicodeMap[$i] = $replace;
                        }
                    }
                }
            }
            else if ( $type == self::TYPE_REPLACE )
            {
                foreach ( $item as $rangeItem )
                {
                    $start = $rangeItem[0];
                    $stop = $rangeItem[1];
                    if ( $start > $stop )
                    {
                        $tmp = $stop;
                        $stop = $start;
                        $start = $tmp;
                    }

                    // The replacement is the same for every ordinal in the range
                    $replace = eZCodeMapper::ordinalValues( $unicodeMap, $rangeItem[2] );
                    if ( $restricted )
                        $replace = self::filterOrdinalsByRanges( $replace, $allowedRanges );
                    $replace = self::collapseOrdinalList( $replace );

                    for ( $i = $start; $i <= $stop; ++$i )
                    {
                        if ( $restricted and !self::isOrdinalInRanges( $i, $allowedRanges ) )
                            continue;

                        // If the mapping already exists we skip it
                        if ( isset( $unicodeMap[$i] ) )
                            continue;

                        $unicodeMap = eZCodeMapper::mapExistingCodes( $unicodeMap, $i, $replace );
                        $unicodeMap[$i] = $replace;
                    }
                }
            }
        }
        return $unicodeMap;
    }

    /*!
     \private
     \return \c true if \a $ordinal lies inside at least one of the
             array( first, last ) ranges in \a $allowedRanges.
    */
    private static function isOrdinalInRanges( $ordinal, $allowedRanges )
    {
        foreach ( $allowedRanges as $allowedRange )
        {
            if ( $ordinal >= $allowedRange[0] and
                 $ordinal <= $allowedRange[1] )
            {
                return true;
            }
        }
        return false;
    }

    /*!
     \private
     \return The entries of \a $ordinals that lie inside the ranges in \a $allowedRanges.
             The result is grouped per range, so an ordinal covered by several
             ranges appears once for each of them.
    */
    private static function filterOrdinalsByRanges( $ordinals, $allowedRanges )
    {
        $filtered = array();
        foreach ( $allowedRanges as $allowedRange )
        {
            foreach ( $ordinals as $ordinal )
            {
                if ( $ordinal >= $allowedRange[0] and
                     $ordinal <= $allowedRange[1] )
                {
                    $filtered[] = $ordinal;
                }
            }
        }
        return $filtered;
    }

    /*!
     \private
     \return \c false for an empty list, the element itself for a list with
             one element and the list unchanged otherwise.
    */
    private static function collapseOrdinalList( $ordinals )
    {
        $total = count( $ordinals );
        if ( $total == 0 )
            return false;
        if ( $total == 1 )
            return $ordinals[0];
        return $ordinals;
    }

    /*!
     Generates a unicode mapping table for identifier \a $identifier.

     \param $identifier Is either a single identifier string or
                        an array with identifiers.
     \return The unicode mapping table for all defined identifiers, sorted by key.
    */
    function generateMappingCode( $identifier )
    {
        $identifiers = is_array( $identifier ) ? $identifier : array( $identifier );
        $table = $this->expandInheritance( $identifiers );

        // We allow all characters for now
        $allowedRanges = array();
        $simpleTable = $this->generateSimpleMappingTable( $table, $allowedRanges );
        ksort( $simpleTable );
        return $simpleTable;
    }

    /*!
     Generates a mapping table for the character set $charset.
     This mapping table will only work for that character set but will be much faster
     and can be fed directly to the strtr() PHP function.
     \return the table or \c false if something failed.
\param $unicodeTable Mapping table with unicode ordinals, the replacement of
                     each entry is a single value or an array of values.
\param $charset Name of the charset the table is generated for.
    */
    function generateCharsetMappingTable( $unicodeTable, $charset )
    {
        $codec = eZTextCodec::instance( 'unicode', $charset );
        if ( !$codec )
        {
            eZDebug::writeError( "Failed to create textcodec for charset '$charset'" );
            return false;
        }

        $charsetTable = array();
        foreach ( $unicodeTable as $match => $replacement )
        {
            // The codec is always handed a list, so single values are wrapped
            $replacementList = is_array( $replacement ) ? $replacement : array( $replacement );

            $matchLocal = $codec->convertString( array( $match ) );
            $charsetTable[$matchLocal] = $codec->convertString( $replacementList );
        }

        // Make sure longer string entries are placed before the shorter ones
        // This is very important when working with utf8 which have
        // variable length for characters
        krsort( $charsetTable );
        return $charsetTable;
    }

    /*!
     Decodes a command into transformation rules.
     \param $name Name of the command
     \param $parameters Array of parameters for the command
     \return An array with the names of the transformation rules, it is empty
             for the special code handlers, for extension commands and for
             unknown commands (an error is written for the latter).
    */
    function decodeCommand( $name, $parameters )
    {
        $names = $this->ruleNames();
        $rules = array();
        switch ( $name )
        {
            // Special code handlers, they have no rules
            case 'url_cleanup_iri':
            case 'url_cleanup':
            case 'url_cleanup_compat':
            case 'identifier_cleanup':
            {
            } break;

            case 'normalize':
            case 'search_normalize':
            case 'decompose':
            case 'diacritical':
            case 'lowercase':
            case 'uppercase':
            case 'search_cleanup':
            {
                if ( count( $parameters ) != 0 )
                {
                    // Only the rules named <parameter>_<command> that exist
                    foreach ( $parameters as $parameter )
                    {
                        $candidate = $parameter . '_' . $name;
                        if ( in_array( $candidate, $names ) )
                            $rules[] = $candidate;
                    }
                }
                else
                {
                    // No parameters, include every rule ending in _<command>
                    $suffixPattern = '#_'. $name . '$#';
                    foreach ( $names as $candidate )
                    {
                        if ( preg_match( $suffixPattern, $candidate ) )
                            $rules[] = $candidate;
                    }
                }
            } break;

            case 'transform':
            case 'transliterate':
            {
                $dividers = array( 'transform' => '_to_',
                                   'transliterate' => '_transliterate_' );
                $divider = $dividers[$name];
                $parameterCount = count( $parameters );
                if ( $parameterCount == 0 )
                {
                    // Include all transformation rules
                    $rulePattern = '#^[a-zA-Z][a-zA-Z0-9-]+'. $divider . '[a-zA-Z][a-zA-Z0-9-]+$#';
                    foreach ( $names as $candidate )
                    {
                        if ( preg_match( $rulePattern, $candidate ) )
                            $rules[] = $candidate;
                    }
                }
                else if ( $parameterCount == 2 )
                {
                    // A specific <from><divider><to> rule
                    $candidate = $parameters[0] . $divider . $parameters[1];
                    if ( in_array( $candidate, $names ) )
                        $rules[] = $candidate;
                }
            } break;

            default:
            {
                // Commands registered by extensions are known but have no rules
                $ini = eZINI::instance( 'transform.ini' );
                $commands = $ini->variable( 'Extensions', 'Commands' );
                if ( !isset( $commands[$name] ) )
                {
                    eZDebug::writeError( "Unknown command '$name'", __METHOD__ );
                }
            } break;
        }
        return $rules;
    }

    /*!
     Generates PHP code for the command \a $command.
     \param $charsetName The name of the charset the text will be in,
                         this can be used to generate different code for different charsets.
     \return A string containing PHP code or \c false if not supported.
\param $command Array describing the command, the name is read from
                the \c 'command' entry.

\note The generated code works on a variable called \c $text. For
      \c search_cleanup it performs the same replacements as
      executeCommandCode(), including keeping the character in front
      of a percent sign that is removed.
    */
    function generateCommandCode( $command, $charsetName )
    {
        switch ( $command['command'] )
        {
            case 'url_cleanup_iri':
            {
                $charsetNameTxt = var_export( $charsetName, true );
                return "\$text = eZCharTransform::commandUrlCleanupIRI( \$text, $charsetNameTxt );\n";
            }

            case 'url_cleanup':
            {
                $charsetNameTxt = var_export( $charsetName, true );
                return "\$text = eZCharTransform::commandUrlCleanup( \$text, $charsetNameTxt );\n";
            }

            case 'url_cleanup_compat':
            {
                $charsetNameTxt = var_export( $charsetName, true );
                return "\$text = eZCharTransform::commandUrlCleanupCompat( \$text, $charsetNameTxt );\n";
            }

            case 'identifier_cleanup':
            {
                $code = ( "\$text = strtolower( \$text );\n" .
                          "\$text = preg_replace( array( \"#[^a-z0-9_ ]#\",\n" .
                          " \"/ /\",\n" .
                          " \"/__+/\",\n" .
                          " \"/^_|_$/\" ),\n" .
                          " array( \" \",\n" .
                          " \"_\",\n" .
                          " \"_\",\n" .
                          " \"\" ),\n" .
                          " \$text );\n" );
                return $code;
            }

            case 'search_cleanup':
            {
                $code = '';
                $nonCJKCharsets = $this->nonCJKCharsets();
                if ( !in_array( $charsetName, $nonCJKCharsets ) )
                {
                    // The charset may contain CJK characters, generate the code which
                    // breaks them up. Each array entry is one line of generated code.
                    $cjkLines = array(
                        '// add N-Gram(N=2) chinese / japanese / korean multibyte characters',
                        '$codec = eZTextCodec::instance( false, \'unicode\' );',
                        '',
                        '$unicodeValueArray = $codec->convertString( $text );',
                        '',
                        '$normalizedTextArray = array();',
                        '$bFlag = false;',
                        'foreach ( array_keys( $unicodeValueArray ) as $valueKey )',
                        '{',
                        ' // Check for word characters that should be broken up for search',
                        ' if ( ( $unicodeValueArray[$valueKey] >= 12289 and',
                        ' $unicodeValueArray[$valueKey] <= 12542 ) or',
                        ' ( $unicodeValueArray[$valueKey] >= 13312 and',
                        ' $unicodeValueArray[$valueKey] <= 40863 ) or',
                        ' ( $unicodeValueArray[$valueKey] >= 44032 and',
                        ' $unicodeValueArray[$valueKey] <= 55203 ) )',
                        ' {',
                        ' if ( $bFlag )',
                        ' {',
                        ' $normalizedTextArray[] = $unicodeValueArray[$valueKey];',
                        ' }',
                        ' $normalizedTextArray[] = 32; // A space',
                        ' $normalizedTextArray[] = $unicodeValueArray[$valueKey];',
                        ' $bFlag = true;',
                        ' }',
                        ' else',
                        ' {',
                        ' if ( $bFlag )',
                        ' {',
                        ' $normalizedTextArray[] = 32; // A space',
                        ' }',
                        ' $normalizedTextArray[] = $unicodeValueArray[$valueKey];',
                        ' $bFlag = false;',
                        ' }',
                        '}',
                        'if ( $bFlag )',
                        '{',
                        ' $normalizedTextArray[count($normalizedTextArray)-1]=32;',
                        '}',
                        '$revCodec = eZTextCodec::instance( \'unicode\', false ); // false means use internal charset',
                        '$text = $revCodec->convertString( $normalizedTextArray );' );
                    $code .= implode( "\n", $cjkLines ) . "\n";
                }

                // The last replacement must be "$1 " and not " ": the pattern captures
                // the non-digit in front of the percent sign and that character has to
                // be kept, exactly as executeCommandCode() does it.
                $cleanupLines = array(
                    '$text = preg_replace( array( "#(\.){2,}#",',
                    ' "#^\.#",',
                    ' "#\s\.#",',
                    ' "#\.\s#",',
                    ' "#\.$#",',
                    ' "#([^0-9])%#" ),',
                    ' array( " ",',
                    ' " ",',
                    ' " ",',
                    ' " ",',
                    ' " ",',
                    ' "$1 " ),',
                    ' $text );',
                    '$ini = eZINI::instance();',
                    'if ( $ini->variable( \'SearchSettings\', \'EnableWildcard\' ) != \'true\' )',
                    '{',
                    ' $text = str_replace( "*", " ", $text );',
                    '}',
                    '$charset = eZTextCodec::internalCharset();',
                    '$hasUTF8 = ( $charset == "utf-8" );',
                    '',
                    'if ( $hasUTF8 )',
                    '{',
                    ' $text = preg_replace( "#(\s+)#u", " ", $text );',
                    '}',
                    'else',
                    '{',
                    ' $text = preg_replace( "#(\s+)#", " ", $text );',
                    '}' );
                // No newline after the last line, as before
                $code .= implode( "\n", $cleanupLines );

                return $code;
            }

            default:
            {
                // Not a built-in command, see if an extension has registered it
                $ini = eZINI::instance( 'transform.ini' );
                $commands = $ini->variable( 'Extensions', 'Commands' );
                if ( isset( $commands[$command['command']] ) )
                {
                    // The setting has the form <include path>:<class name>
                    list( $path, $className ) = explode( ':', $commands[$command['command']], 2 );
                    if ( file_exists( $path ) )
                    {
                        $charsetNameTxt = var_export( $charsetName, true );
                        $commandTxt = var_export( $command['command'], true );
                        $pathTxt = var_export( $path, true );
                        $code = "include_once( $pathTxt );\n\$text = $className::executeCommand( \$text, $commandTxt, $charsetNameTxt );\n";
                        return $code;
                    }
                    else
                    {
                        eZDebug::writeError( "Could not locate include file '$path' for transformation '" . $command['command'] . "'" );
                    }
                }
            } break;
        }
        return false;
    }

    /*!
     Executes custom PHP code for the command \a $command.
     \param $charsetName The name of the charset the text will be in,
                         this can be used to execute different code for different charsets.
     \return \c true if the command is supported, \c false otherwise.
\param $text The text to work on, it is passed by reference and modified in place.
\param $command Array describing the command, the name is read from
                the \c 'command' entry.
    */
    function executeCommandCode( &$text, $command, $charsetName )
    {
        switch ( $command['command'] )
        {
            case 'url_cleanup_iri':
            {
                $text = eZCharTransform::commandUrlCleanupIRI( $text, $charsetName );
                return true;
            }

            case 'url_cleanup':
            {
                $text = eZCharTransform::commandUrlCleanup( $text, $charsetName );
                return true;
            }

            case 'url_cleanup_compat':
            {
                $text = eZCharTransform::commandUrlCleanupCompat( $text, $charsetName );
                return true;
            }

            case 'identifier_cleanup':
            {
                $patterns = array( "#[^a-z0-9_ ]#",
                                   "/ /",
                                   "/__+/",
                                   "/^_|_$/" );
                $replacements = array( " ",
                                       "_",
                                       "_",
                                       "" );
                $text = preg_replace( $patterns, $replacements, strtolower( $text ) );
                return true;
            }

            case 'search_cleanup':
            {
                if ( !in_array( $charsetName, $this->nonCJKCharsets() ) )
                {
                    // Add spaces around chinese / japanese / korean multibyte characters
                    $codec = eZTextCodec::instance( false, 'unicode' );
                    $unicodeValueArray = $codec->convertString( $text );

                    $normalizedTextArray = array();
                    $previousWasCJK = false;
                    foreach ( $unicodeValueArray as $unicodeValue )
                    {
                        // Check for word characters that should be broken up for search
                        $isCJK = ( ( $unicodeValue >= 12289 and $unicodeValue <= 12542 ) or
                                   ( $unicodeValue >= 13312 and $unicodeValue <= 40863 ) or
                                   ( $unicodeValue >= 44032 and $unicodeValue <= 55203 ) );
                        if ( $isCJK )
                        {
                            // Directly after another CJK character the value is added
                            // twice, once before and once after the space
                            if ( $previousWasCJK )
                            {
                                $normalizedTextArray[] = $unicodeValue;
                            }
                            $normalizedTextArray[] = 32; // A space
                            $normalizedTextArray[] = $unicodeValue;
                        }
                        else
                        {
                            if ( $previousWasCJK )
                            {
                                $normalizedTextArray[] = 32; // A space
                            }
                            $normalizedTextArray[] = $unicodeValue;
                        }
                        $previousWasCJK = $isCJK;
                    }

                    // When the text ends with a CJK character the last entry
                    // is overwritten with a space
                    if ( $previousWasCJK )
                    {
                        $lastIndex = count( $normalizedTextArray ) - 1;
                        $normalizedTextArray[$lastIndex] = 32;
                    }

                    $revCodec = eZTextCodec::instance( 'unicode', false ); // false means use internal charset
                    $text = $revCodec->convertString( $normalizedTextArray );
                }

                // Make sure dots inside words/numbers are kept, the rest is turned into space
                $patterns = array( "#(\.){2,}#",
                                   "#^\.#",
                                   "#\s\.#",
                                   "#\.\s#",
                                   "#\.$#",
                                   "#([^0-9])%#" ); // Keep only % after a number
                $replacements = array( " ",
                                       " ",
                                       " ",
                                       " ",
                                       " ",
                                       "$1 " );
                $text = preg_replace( $patterns, $replacements, $text );

                $ini = eZINI::instance();
                if ( $ini->variable( 'SearchSettings', 'EnableWildcard' ) != 'true' )
                {
                    $text = str_replace( "*", " ", $text );
                }

                // Collapse whitespace, with the u modifier when the internal charset is utf-8
                $charset = eZTextCodec::internalCharset();
                $whitespacePattern = ( $charset == "utf-8" ) ? "#(\s+)#u" : "#(\s+)#";
                $text = preg_replace( $whitespacePattern, " ", $text );

                return true;
            }

            default:
            {
                // Not a built-in command, see if an extension has registered it
                $ini = eZINI::instance( 'transform.ini' );
                $commands = $ini->variable( 'Extensions', 'Commands' );
                if ( isset( $commands[$command['command']] ) )
                {
                    // The setting has the form <include path>:<class name>
                    list( $path, $className ) = explode( ':', $commands[$command['command']], 2 );
                    if ( file_exists( $path ) )
                    {
                        include_once( $path );
                        $text = call_user_func_array( array( $className, 'executeCommand' ),
                                                      array( $text, $command['command'], $charsetName ) );
                        return true;
                    }
                    else
                    {
                        eZDebug::writeError( "Could not locate include file '$path' for transformation '" . $command['command'] . "'" );
                    }
                }
            } break;
        }
        return false;
    }

    /*!
     \return An array with charsets that are certain to not contain CJK characters.
    */
    function nonCJKCharsets()
    {
        $charsets = array( 'adobe-standard-encoding',
                           // DOS code pages
                           'cp437', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855', 'cp857',
                           'cp860', 'cp861', 'cp862', 'cp863', 'cp864', 'cp865', 'cp866',
                           'cp869', 'cp874',
                           'dec-mcs', 'hp-roman8',
                           // ISO 8859 family
                           'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5',
                           'iso-8859-6', 'iso-8859-7', 'iso-8859-8', 'iso-8859-9', 'iso-8859-10',
                           'iso-8859-11', 'iso-8859-13', 'iso-8859-14', 'iso-8859-15',
                           'koi8-r', 'koi8-u', 'macintosh', 'next', 'us-ascii',
                           // Windows code pages
                           'windows-1250', 'windows-1251', 'windows-1252', 'windows-1253',
                           'windows-1254', 'windows-1255', 'windows-1256', 'windows-1257',
                           'windows-1258' );
        return $charsets;
    }

    /// \privatesection
    /// Mapping tables indexed by rule identifier, see mappingTable() and ruleNames()
    public $TransformationTables;
    /// Initialized to an empty array by the constructor
    public $TransformationFiles;
    /// Codec used by extractUnicodeValue() and extractUnicodeValues() to convert strings
    public $ISOUnicodeCodec;
}

?>