eZ Publish  [4.0]
ezutf8codec.php
Go to the documentation of this file.
00001 <?php
00002 //
00003 // Definition of eZUTF8Codec class
00004 //
00005 // ## BEGIN COPYRIGHT, LICENSE AND WARRANTY NOTICE ##
00006 // SOFTWARE NAME: eZ Publish
00007 // SOFTWARE RELEASE: 4.0.x
00008 // COPYRIGHT NOTICE: Copyright (C) 1999-2008 eZ Systems AS
00009 // SOFTWARE LICENSE: GNU General Public License v2.0
00010 // NOTICE: >
00011 //   This program is free software; you can redistribute it and/or
00012 //   modify it under the terms of version 2.0  of the GNU General
00013 //   Public License as published by the Free Software Foundation.
00014 //
00015 //   This program is distributed in the hope that it will be useful,
00016 //   but WITHOUT ANY WARRANTY; without even the implied warranty of
00017 //   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018 //   GNU General Public License for more details.
00019 //
00020 //   You should have received a copy of version 2.0 of the GNU General
00021 //   Public License along with this program; if not, write to the Free
00022 //   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00023 //   MA 02110-1301, USA.
00024 //
00025 //
00026 // ## END COPYRIGHT, LICENSE AND WARRANTY NOTICE ##
00027 //
00028 
/*!
  \class eZUTF8Codec ezutf8codec.php
  \ingroup eZI18N
  \brief Converter between UTF-8 strings and 32-bit Unicode values

  Allows conversion from UTF-8 characters to 32-bit Unicode values,
  and vice versa. Besides the usual 1-4 byte forms, the historical
  5 and 6 byte forms (up to 31 bits) are also encoded and decoded.

*/

class eZUTF8Codec
{
    /*!
     Initializes the UTF-8 codec. The codec holds no state, so nothing is set up here.
    */
    function eZUTF8Codec()
    {
    }

    /*!
     Converts the UTF-8 string \a $str into Unicode values and returns an array with the values.

     Byte sequences that fromUtf8() cannot decode are left out of the result;
     decoding continues after the number of bytes fromUtf8() reported for them.
    */
    static function convertStringToUnicode( $str )
    {
        $unicodeValues = array();
        $strLen = strlen( $str );
        for ( $offset = 0; $offset < $strLen; )
        {
            $charLen = 1;
            $unicodeValue = self::fromUtf8( $str, $offset, $charLen );
            if ( $unicodeValue !== false )
                $unicodeValues[] = $unicodeValue;
            $offset += $charLen;
        }
        return $unicodeValues;
    }

    /*!
     Converts the array \a $unicodeValues of Unicode values into an UTF-8 string and returns it.

     \return the encoded string, or \c false if \a $unicodeValues is not an array.
    */
    static function convertUnicodeToString( $unicodeValues )
    {
        if ( !is_array( $unicodeValues ) )
            return false;
        $text = '';
        foreach ( $unicodeValues as $unicodeValue )
        {
            $text .= self::toUTF8( $unicodeValue );
        }
        return $text;
    }

    /*!
     \static
     Converts the 32 bit integer \a $char_code to a UTF-8 string representing the Unicode character.

     The shortest form able to hold the value is chosen: 1 byte for 7 bit values,
     then 2, 3, 4, 5 and 6 bytes for 11, 16, 21, 26 and 31 bit values respectively.
     Any value with bits set outside the low 26 bits gets the 6 byte form.
    */
    static function toUTF8( $char_code )
    {
        // An explicit if-chain is used instead of a switch: each range must
        // return on its own, a missing break must not be able to merge two forms.
        if ( !( $char_code & 0xffffff80 ) ) // 7 bit (includes 0)
        {
            return chr( $char_code );
        }
        if ( !( $char_code & 0xfffff800 ) ) // 11 bit
        {
            return ( chr( 0xc0 | ( ( $char_code >> 6 ) & 0x1f ) ) .
                     chr( 0x80 | ( $char_code & 0x3f ) ) );
        }
        if ( !( $char_code & 0xffff0000 ) ) // 16 bit
        {
            return ( chr( 0xe0 | ( ( $char_code >> 12 ) & 0x0f ) ) .
                     chr( 0x80 | ( ( $char_code >> 6 ) & 0x3f ) ) .
                     chr( 0x80 | ( $char_code & 0x3f ) ) );
        }
        if ( !( $char_code & 0xffe00000 ) ) // 21 bit
        {
            return ( chr( 0xf0 | ( ( $char_code >> 18 ) & 0x07 ) ) .
                     chr( 0x80 | ( ( $char_code >> 12 ) & 0x3f ) ) .
                     chr( 0x80 | ( ( $char_code >> 6 ) & 0x3f ) ) .
                     chr( 0x80 | ( $char_code & 0x3f ) ) );
        }
        if ( !( $char_code & 0xfc000000 ) ) // 26 bit
        {
            return ( chr( 0xf8 | ( ( $char_code >> 24 ) & 0x03 ) ) .
                     chr( 0x80 | ( ( $char_code >> 18 ) & 0x3f ) ) .
                     chr( 0x80 | ( ( $char_code >> 12 ) & 0x3f ) ) .
                     chr( 0x80 | ( ( $char_code >> 6 ) & 0x3f ) ) .
                     chr( 0x80 | ( $char_code & 0x3f ) ) );
        }
        // 31 bit
        return ( chr( 0xfc | ( ( $char_code >> 30 ) & 0x01 ) ) .
                 chr( 0x80 | ( ( $char_code >> 24 ) & 0x3f ) ) .
                 chr( 0x80 | ( ( $char_code >> 18 ) & 0x3f ) ) .
                 chr( 0x80 | ( ( $char_code >> 12 ) & 0x3f ) ) .
                 chr( 0x80 | ( ( $char_code >> 6 ) & 0x3f ) ) .
                 chr( 0x80 | ( $char_code & 0x3f ) ) );
    }

    /*!
     \static
     Converts the UTF-8 char starting at byte offset \a $offs in the string
     \a $multi_char to a 32 bit integer.

     \a $len is always set: to the length in bytes announced by the lead byte
     (even when decoding fails), or to 1 when \a $offs is outside the string or
     the byte at \a $offs is not a valid lead byte. It can be used to find the
     next char.

     \return the Unicode value, or \c false if the sequence is truncated, has a
             malformed continuation byte, starts with an invalid lead byte, or
             is an overlong encoding (a value that fits in a shorter form).
    */
    static function fromUtf8( $multi_char, $offs, &$len )
    {
        // Guarantee forward progress for callers even when nothing can be decoded.
        $len = 1;
        if ( !isset( $multi_char[$offs] ) )
            return false;

        $lead = ord( $multi_char[$offs] );
        if ( ( $lead & 0x80 ) === 0x00 ) // 7 bit, 1 char
            return $lead;

        // For each multibyte form: total length, payload bits of the lead byte
        // and the smallest value that legitimately needs that many bytes.
        if ( ( $lead & 0xe0 ) === 0xc0 ) // 11 bit, 2 chars
        {
            $len = 2;
            $char_code = $lead & 0x1f;
            $minimum = 128;
        }
        else if ( ( $lead & 0xf0 ) === 0xe0 ) // 16 bit, 3 chars
        {
            $len = 3;
            $char_code = $lead & 0x0f;
            $minimum = 2048;
        }
        else if ( ( $lead & 0xf8 ) === 0xf0 ) // 21 bit, 4 chars
        {
            $len = 4;
            $char_code = $lead & 0x07;
            $minimum = 65536;
        }
        else if ( ( $lead & 0xfc ) === 0xf8 ) // 26 bit, 5 chars
        {
            $len = 5;
            $char_code = $lead & 0x03;
            $minimum = 2097152;
        }
        else if ( ( $lead & 0xfe ) === 0xfc ) // 31 bit, 6 chars
        {
            $len = 6;
            $char_code = $lead & 0x01;
            $minimum = 67108864;
        }
        else // Stray continuation byte (0x80-0xbf) or 0xfe/0xff
        {
            return false;
        }

        for ( $i = 1; $i < $len; ++$i )
        {
            // Truncated input: the string ends before the sequence does.
            if ( !isset( $multi_char[$offs + $i] ) )
                return false;
            $byte = ord( $multi_char[$offs + $i] );
            // Every trailing byte must look like 10xxxxxx.
            if ( ( $byte & 0xc0 ) !== 0x80 )
                return false;
            $char_code = ( $char_code << 6 ) | ( $byte & 0x3f );
        }

        // Illegal multibyte, should have used fewer chars
        if ( $char_code < $minimum )
            return false;

        return $char_code;
    }

    /*!
     \static
     \return a 64 entry table mapping the top six bits of a byte (byte >> 2)
             to the length in bytes of the UTF-8 sequence that byte starts.
             Entries for continuation bytes (0x80-0xbf) are 0.

     The table is built once and cached in $GLOBALS['eZUTF8LengthTable'].
    */
    static function utf8LengthTable()
    {
        if ( empty( $GLOBALS['eZUTF8LengthTable'] ) )
        {
            $GLOBALS['eZUTF8LengthTable'] =
                array( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00-0x3f
                       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40-0x7f
                       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80-0xbf
                       2, 2, 2, 2, 2, 2, 2, 2,                         // 0xc0-0xdf
                       3, 3, 3, 3,                                     // 0xe0-0xef
                       4, 4,                                           // 0xf0-0xf7
                       5,                                              // 0xf8-0xfb
                       6 );                                            // 0xfc-0xff
        }
        return $GLOBALS['eZUTF8LengthTable'];
    }

    /*!
     \static
     \return the length in bytes of the UTF-8 char that the byte at offset
             \a $pos in \a $str starts, as given by utf8LengthTable().
             The result is 0 when that byte is a continuation byte.
    */
    static function characterByteLength( $str, $pos )
    {
        $table = self::utf8LengthTable();
        $char = ord( $str[$pos] );
        return $table[( $char >> 2 ) & 0x3f];
    }

    /*!
     \static
     \return the number of UTF-8 chars in \a $str.

     Only lead bytes are inspected. A stray continuation byte is counted as one
     char and skipped, so malformed input cannot stall the scan.
    */
    static function strlen( $str )
    {
        $table = self::utf8LengthTable();
        $len = strlen( $str );
        $strlen = 0;
        for ( $i = 0; $i < $len; ++$strlen )
        {
            $char_len = $table[( ord( $str[$i] ) >> 2 ) & 0x3f];
            // The table holds 0 for continuation bytes; always advance at least one byte.
            $i += ( $char_len > 0 ) ? $char_len : 1;
        }
        return $strlen;
    }

    /*!
     \static
     \return a unique instance of the UTF8 codec, created on first use and
             cached in $GLOBALS['eZUTF8CodecInstance'].
    */
    static function instance()
    {
        if ( empty( $GLOBALS['eZUTF8CodecInstance'] ) )
        {
            $GLOBALS['eZUTF8CodecInstance'] = new eZUTF8Codec();
        }
        return $GLOBALS['eZUTF8CodecInstance'];
    }
}
00271 
00272 ?>