|
eZ Publish
[4.0]
|
<?php
//
// Definition of eZUTF8Codec class
//
// ## BEGIN COPYRIGHT, LICENSE AND WARRANTY NOTICE ##
// SOFTWARE NAME: eZ Publish
// SOFTWARE RELEASE: 4.0.x
// COPYRIGHT NOTICE: Copyright (C) 1999-2008 eZ Systems AS
// SOFTWARE LICENSE: GNU General Public License v2.0
// NOTICE: >
//   This program is free software; you can redistribute it and/or
//   modify it under the terms of version 2.0  of the GNU General
//   Public License as published by the Free Software Foundation.
//
//   This program is distributed in the hope that it will be useful,
//   but WITHOUT ANY WARRANTY; without even the implied warranty of
//   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//   GNU General Public License for more details.
//
//   You should have received a copy of version 2.0 of the GNU General
//   Public License along with this program; if not, write to the Free
//   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
//   MA 02110-1301, USA.
//
//
// ## END COPYRIGHT, LICENSE AND WARRANTY NOTICE ##
//

/*!
  \class eZUTF8Codec ezutf8codec.php
  \ingroup eZI18N
  \brief Converter for utf8 and 32bit unicode

  Allows for conversion from utf8 characters to 32bit unicode values,
  and vice versa. Sequences of up to 6 bytes (31 bit values) are handled.

*/

class eZUTF8Codec
{
    /*!
     Initializes utf8 codec. The codec keeps no per-instance state; all
     conversion functions are static.
    */
    function eZUTF8Codec()
    {
    }

    /*!
     \static
     Converts an UTF8 string into Unicode values and returns an array with the values.

     Sequences which fromUtf8() rejects (it returns \c false) are left out of
     the result; decoding continues after the number of bytes reported by
     fromUtf8() for the rejected sequence.

     \param $str The UTF8 encoded string.
     \return An array of integers, one per decoded character.
    */
    public static function convertStringToUnicode( $str )
    {
        $unicodeValues = array();
        $strLen = strlen( $str );
        for ( $offset = 0; $offset < $strLen; )
        {
            // Default step, used when fromUtf8() does not report a length.
            $charLen = 1;
            $unicodeValue = eZUTF8Codec::fromUtf8( $str, $offset, $charLen );
            if ( $unicodeValue !== false )
                $unicodeValues[] = $unicodeValue;
            $offset += $charLen;
        }
        return $unicodeValues;
    }

    /*!
     \static
     Converts an array with Unicode values into an UTF8 string and returns it.

     \param $unicodeValues Array of integer Unicode values.
     \return The UTF8 string, or \c false if \a $unicodeValues is not an array.
    */
    public static function convertUnicodeToString( $unicodeValues )
    {
        if ( !is_array( $unicodeValues ) )
            return false;
        $text = '';
        foreach ( $unicodeValues as $unicodeValue )
        {
            $text .= eZUTF8Codec::toUTF8( $unicodeValue );
        }
        return $text;
    }

    /*!
     \static
     Converts the 32 bit integer $char_code to a utf8 string representing the Unicode character.

     The shortest sequence able to hold the value is chosen by testing which
     high bits are set. Values which need more than 26 bits are written as a
     6 byte sequence holding the low 31 bits.

     \param $char_code The integer value to encode.
     \return A string of 1 to 6 bytes.
    */
    public static function toUTF8( $char_code )
    {
        if ( !( $char_code & 0xffffff80 ) ) // 7 bit
        {
            return chr( $char_code );
        }
        else if ( !( $char_code & 0xfffff800 ) ) // 11 bit
        {
            return ( chr( 0xc0 | ( ( $char_code >> 6 ) & 0x1f ) ) .
                     chr( 0x80 | ( $char_code & 0x3f ) ) );
        }
        else if ( !( $char_code & 0xffff0000 ) ) // 16 bit
        {
            return ( chr( 0xe0 | ( ( $char_code >> 12 ) & 0x0f ) ) .
                     chr( 0x80 | ( ( $char_code >> 6 ) & 0x3f ) ) .
                     chr( 0x80 | ( $char_code & 0x3f ) ) );
        }
        else if ( !( $char_code & 0xffe00000 ) ) // 21 bit
        {
            return ( chr( 0xf0 | ( ( $char_code >> 18 ) & 0x07 ) ) .
                     chr( 0x80 | ( ( $char_code >> 12 ) & 0x3f ) ) .
                     chr( 0x80 | ( ( $char_code >> 6 ) & 0x3f ) ) .
                     chr( 0x80 | ( $char_code & 0x3f ) ) );
        }
        else if ( !( $char_code & 0xfc000000 ) ) // 26 bit
        {
            // Returning here keeps 26 bit values as 5 byte sequences; they
            // must not continue into the 31 bit encoding below.
            return ( chr( 0xf8 | ( ( $char_code >> 24 ) & 0x03 ) ) .
                     chr( 0x80 | ( ( $char_code >> 18 ) & 0x3f ) ) .
                     chr( 0x80 | ( ( $char_code >> 12 ) & 0x3f ) ) .
                     chr( 0x80 | ( ( $char_code >> 6 ) & 0x3f ) ) .
                     chr( 0x80 | ( $char_code & 0x3f ) ) );
        }

        // 31 bit
        return ( chr( 0xfc | ( ( $char_code >> 30 ) & 0x01 ) ) .
                 chr( 0x80 | ( ( $char_code >> 24 ) & 0x3f ) ) .
                 chr( 0x80 | ( ( $char_code >> 18 ) & 0x3f ) ) .
                 chr( 0x80 | ( ( $char_code >> 12 ) & 0x3f ) ) .
                 chr( 0x80 | ( ( $char_code >> 6 ) & 0x3f ) ) .
                 chr( 0x80 | ( $char_code & 0x3f ) ) );
    }

    /*!
     \static
     Converts the first utf8 char in the string $multi_char to a 32 bit integer.
     $offs is the offset in the string.
     $len will contain the length of utf8 char in the string which can be used to
     find the next char.

     \return The decoded value, or \c false when
             - \a $offs lies outside the string ($len is set to 1),
             - the byte at \a $offs cannot start a sequence ($len is set to 1),
             - the sequence is cut off by the end of the string,
             - one of the following bytes is not a continuation byte (10xxxxxx),
             - the value could have been stored in a shorter sequence (overlong).
             In the last three cases $len is the length announced by the
             first byte.
    */
    public static function fromUtf8( $multi_char, $offs, &$len )
    {
        $byteCount = strlen( $multi_char );
        if ( $offs < 0 or $offs >= $byteCount )
        {
            $len = 1;
            return false;
        }

        $lead = ord( $multi_char[$offs] );
        if ( ( $lead & 0x80 ) == 0x00 ) // 7 bit, 1 char
        {
            $len = 1;
            return $lead;
        }

        // For each multibyte form: sequence length, payload bits of the first
        // byte and the smallest value which really needs that many bytes.
        if ( ( $lead & 0xe0 ) == 0xc0 ) // 11 bit, 2 chars
        {
            $seqLen = 2;
            $char_code = $lead & 0x1f;
            $minimum = 128;
        }
        else if ( ( $lead & 0xf0 ) == 0xe0 ) // 16 bit, 3 chars
        {
            $seqLen = 3;
            $char_code = $lead & 0x0f;
            $minimum = 2048;
        }
        else if ( ( $lead & 0xf8 ) == 0xf0 ) // 21 bit, 4 chars
        {
            $seqLen = 4;
            $char_code = $lead & 0x07;
            $minimum = 65536;
        }
        else if ( ( $lead & 0xfc ) == 0xf8 ) // 26 bit, 5 chars
        {
            $seqLen = 5;
            $char_code = $lead & 0x03;
            $minimum = 2097152;
        }
        else if ( ( $lead & 0xfe ) == 0xfc ) // 31 bit, 6 chars
        {
            $seqLen = 6;
            $char_code = $lead & 0x01;
            $minimum = 67108864;
        }
        else
        {
            // A continuation byte or 0xfe/0xff can not start a character.
            // Report one byte so that callers always move forward.
            $len = 1;
            return false;
        }

        $len = $seqLen;

        // Truncated sequence, do not read past the end of the string.
        if ( $offs + $seqLen > $byteCount )
            return false;

        for ( $i = 1; $i < $seqLen; ++$i )
        {
            $byte = ord( $multi_char[$offs + $i] );
            if ( ( $byte & 0xc0 ) != 0x80 )
                return false;
            $char_code = ( $char_code << 6 ) | ( $byte & 0x3f );
        }

        // Illegal multibyte, should use fewer chars
        if ( $char_code < $minimum )
            return false;

        return $char_code;
    }

    /*!
     \static
     \return A table with 64 entries which maps the upper six bits of a byte
             (byte >> 2) to the length of the utf8 sequence that byte starts.
             The entries for continuation bytes (0x80 - 0xbf) are 0.
             The table is built once and kept in $GLOBALS['eZUTF8LengthTable'].
    */
    public static function utf8LengthTable()
    {
        if ( empty( $GLOBALS['eZUTF8LengthTable'] ) )
        {
            $GLOBALS['eZUTF8LengthTable'] =
                array( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   // 0x00 - 0x3f
                       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   // 0x40 - 0x7f
                       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x80 - 0xbf
                       2, 2, 2, 2, 2, 2, 2, 2,                           // 0xc0 - 0xdf
                       3, 3, 3, 3,                                       // 0xe0 - 0xef
                       4, 4,                                             // 0xf0 - 0xf7
                       5,                                                // 0xf8 - 0xfb
                       6 );                                              // 0xfc - 0xff
        }
        return $GLOBALS['eZUTF8LengthTable'];
    }

    /*!
     \static
     \return The length in bytes of the utf8 sequence which starts with the
             byte at position \a $pos in \a $str, taken from utf8LengthTable().
             The result is 0 if that byte is a continuation byte.
    */
    public static function characterByteLength( $str, $pos )
    {
        $table = eZUTF8Codec::utf8LengthTable();
        $char = ord( $str[$pos] );
        return $table[( $char >> 2 ) & 0x3f];
    }

    /*!
     \static
     \return The number of utf8 characters in \a $str.

     Only the first byte of each sequence is inspected, the bytes following it
     are skipped without being validated. A continuation byte found where a
     character should start is counted as one character.
    */
    public static function strlen( $str )
    {
        $table = eZUTF8Codec::utf8LengthTable();
        $len = strlen( $str );
        $strlen = 0;
        for ( $i = 0; $i < $len; )
        {
            $char = ord( $str[$i] );
            $char_len = $table[( $char >> 2 ) & 0x3f];
            // The table holds 0 for continuation bytes; always step at least
            // one byte, otherwise the loop would never end.
            $i += ( $char_len > 0 ) ? $char_len : 1;
            ++$strlen;
        }
        return $strlen;
    }

    /*!
     \static
     \return a unique instance of the UTF8 codec.

     The instance is created on the first call and kept in
     $GLOBALS['eZUTF8CodecInstance'].
    */
    public static function instance()
    {
        if ( empty( $GLOBALS['eZUTF8CodecInstance'] ) )
        {
            $GLOBALS['eZUTF8CodecInstance'] = new eZUTF8Codec();
        }
        return $GLOBALS['eZUTF8CodecInstance'];
    }
}