00001 <?php
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00136 class t3lib_cs {
/**
 * Byte value written for a character that has no representation in the
 * target charset (63 is the ASCII code of "?").
 */
var $noCharByteVal = 63;

/**
 * Conversion tables loaded so far by initCharset(), keyed by charset name.
 * Each entry has a 'local' map (charset code => UTF-8 sequence) and a
 * 'utf8' map (UTF-8 sequence => charset code).
 */
var $parsedCharsets = array();

/**
 * Case folding tables keyed by charset name; each holds the sub-maps
 * 'toUpper', 'toLower' and 'toTitle'.
 */
var $caseFolding = array();

/**
 * Tables mapping a character to its plain ASCII replacement, keyed by charset name.
 */
var $toASCII = array();

/**
 * Charsets treated as using exactly two bytes per character.
 */
var $twoByteSets = array(
	'ucs-2' => 1,
);

/**
 * Charsets treated as using exactly four bytes per character.
 */
var $fourByteSets = array(
	'ucs-4'  => 1,
	'utf-32' => 1,
);

/**
 * Charsets handled by the euc_* routines: a byte above 127 starts a
 * two-byte character (for shift_jis the single bytes 0xA0-0xDF are exempt,
 * see utf8_encode()).
 */
var $eucBasedSets = array(
	'gb2312'    => 1,
	'big5'      => 1,
	'euc-kr'    => 1,
	'shift_jis' => 1,
);
00166
00167
00168
/**
 * Alternative charset names mapped to the name used internally by this class.
 * Keys must be lower case because parse_charset() lower-cases its input
 * before the lookup.
 */
var $synonyms = array(
	// ASCII
	'us' => 'ascii',
	'us-ascii'=> 'ascii',

	// ISO-8859 family
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-109' => 'iso-8859-2',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',

	// Windows code pages
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',

	// KOI8 and Macintosh
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',

	// East Asian multi-byte sets
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',

	// Unicode encodings (the second, redundant 'utf8' entry was dropped)
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
00246
00247
/**
 * Maps a language identifier to the language family used as key in
 * $lang_to_charset_unix / $lang_to_charset_windows.
 * get_locale_charset() looks up the lower-cased language part of a locale here.
 *
 * Fixes compared to the previous table:
 *  - 'isl' pointed to the misspelt family 'west_euorpean', which exists in
 *    neither charset table.
 *  - 'rus', 'bulgarian' and 'turkish' contradicted the entries 'ru'/'russian',
 *    'bg' and 'trk' of this very table.
 *  - 'swedish' was only reachable through the misspelt key 'svedish'; the old
 *    key is kept for backward compatibility.
 */
var $lang_to_langfamily = array(
	// two-letter language codes
	'ar' => 'arabic',
	'bg' => 'cyrillic',
	'cs' => 'east_european',
	'da' => 'west_european',
	'de' => 'west_european',
	'es' => 'west_european',
	'et' => 'estonian',
	'eu' => 'west_european',
	'fi' => 'west_european',
	'fr' => 'west_european',
	'gr' => 'greek',
	'hr' => 'east_european',
	'hu' => 'east_european',
	'iw' => 'hebrew',
	'is' => 'west_european',
	'it' => 'west_european',
	'ja' => 'japanese',
	'kl' => 'west_european',
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european',
	'nl' => 'west_european',
	'no' => 'west_european',
	'pl' => 'east_european',
	'pt' => 'west_european',
	'ro' => 'east_european',
	'ru' => 'cyrillic',
	'sk' => 'east_european',
	'sl' => 'east_european',
	'sv' => 'west_european',
	'th' => 'thai',
	'uk' => 'cyrillic',
	'vi' => 'vietnamese',
	'zh' => 'chinese',

	// three-letter identifiers (presumably Windows language codes - TODO confirm)
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european',
	'dan' => 'west_european',
	'deu' => 'west_european',
	'dea' => 'west_european',
	'des' => 'west_european',
	'ena' => 'west_european',
	'enc' => 'west_european',
	'eng' => 'west_european',
	'enz' => 'west_european',
	'enu' => 'west_european',
	'nld' => 'west_european',
	'nlb' => 'west_european',
	'fin' => 'west_european',
	'fra' => 'west_european',
	'frb' => 'west_european',
	'frc' => 'west_european',
	'frs' => 'west_european',
	'ell' => 'greek',
	'hun' => 'east_european',
	'isl' => 'west_european',
	'ita' => 'west_european',
	'its' => 'west_european',
	'jpn' => 'japanese',
	'kor' => 'korean',
	'nor' => 'west_european',
	'non' => 'west_european',
	'plk' => 'east_european',
	'ptg' => 'west_european',
	'ptb' => 'west_european',
	'rus' => 'cyrillic',
	'sky' => 'east_european',
	'esp' => 'west_european',
	'esm' => 'west_european',
	'esn' => 'west_european',
	'sve' => 'west_european',
	'trk' => 'turkish',

	// English language names
	'bulgarian' => 'cyrillic',
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'german' => 'west_european',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'norwegian' => 'west_european',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'svedish' => 'west_european',	// historic misspelling, kept for compatibility
	'swedish' => 'west_european',
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
00354
00355
/**
 * Language family => charset, used by get_locale_charset() when TYPO3_OS
 * is not 'WIN'. An empty value makes get_locale_charset() fall back to
 * its default 'iso-8859-1'.
 */
var $lang_to_charset_unix = array(
	'west_european'  => 'iso-8859-1',
	'estonian'       => 'iso-8859-1',
	'east_european'  => 'iso-8859-2',
	'baltic'         => 'iso-8859-4',
	'cyrillic'       => 'iso-8859-5',
	'arabic'         => 'iso-8859-6',
	'greek'          => 'iso-8859-7',
	'hebrew'         => 'iso-8859-8',
	'turkish'        => 'iso-8859-9',
	'thai'           => 'iso-8859-11',
	'lithuanian'     => 'iso-8859-13',
	'chinese'        => 'gb2312',
	'japanese'       => 'euc-jp',
	'korean'         => 'euc-kr',
	'simpl_chinese'  => 'gb2312',
	'trad_chinese'   => 'big5',
	'vietnamese'     => '',
);

/**
 * Language family => charset, used by get_locale_charset() when TYPO3_OS
 * equals 'WIN'.
 */
var $lang_to_charset_windows = array(
	'east_european'  => 'windows-1250',
	'cyrillic'       => 'windows-1251',
	'west_european'  => 'windows-1252',
	'greek'          => 'windows-1253',
	'turkish'        => 'windows-1254',
	'hebrew'         => 'windows-1255',
	'arabic'         => 'windows-1256',
	'baltic'         => 'windows-1257',
	'estonian'       => 'windows-1257',
	'lithuanian'     => 'windows-1257',
	'vietnamese'     => 'windows-1258',
	'thai'           => 'cp874',
	'korean'         => 'cp949',
	'chinese'        => 'gb2312',
	'japanese'       => 'shift_jis',
	'simpl_chinese'  => 'gb2312',
	'trad_chinese'   => 'big5',
);

/**
 * Complete (lower-cased) locale strings with a fixed charset; checked by
 * get_locale_charset() before the locale is taken apart.
 */
var $locale_to_charset = array(
	'japanese.euc'  => 'euc-jp',
	'ja_jp.ujis'    => 'euc-jp',
	'korean.euc'    => 'euc-kr',
	'zh_cn'         => 'gb2312',
	'zh_hk'         => 'big5',
	'zh_tw'         => 'big5',
);
00406
00407
00408
/**
 * Charset per language key. An empty string means that no specific charset
 * is registered for the key here (presumably the consumer then applies its
 * own default - the consumer is outside this view, TODO confirm).
 */
var $charSetArray = array(
	'dk' => '',
	'de' => '',
	'no' => '',
	'it' => '',
	'fr' => '',
	'es' => '',
	'nl' => '',
	'cz' => 'windows-1250',
	'pl' => 'iso-8859-2',
	'si' => 'windows-1250',
	'fi' => '',
	'tr' => 'iso-8859-9',
	'se' => '',
	'pt' => '',
	'ru' => 'windows-1251',
	'ro' => 'iso-8859-2',
	'ch' => 'gb2312',
	'sk' => 'windows-1250',
	'lt' => 'windows-1257',
	'is' => 'utf-8',
	'hr' => 'windows-1250',
	'hu' => 'iso-8859-2',
	'gl' => '',
	'th' => 'iso-8859-11',
	'gr' => 'iso-8859-7',
	'hk' => 'big5',
	'eu' => '',
	'bg' => 'windows-1251',
	'br' => '',
	'et' => 'iso-8859-4',
	'ar' => 'iso-8859-6',
	'he' => 'utf-8',
	'ua' => 'windows-1251',
	'jp' => 'shift_jis',
	'lv' => 'utf-8',
	'vn' => 'utf-8',
	'ca' => 'iso-8859-15',
	'ba' => 'iso-8859-2',
	'kr' => 'euc-kr',
	'eo' => 'utf-8',
	'my' => '',
	'hi' => 'utf-8',
);

/**
 * Language/locale code per language key; only filled in where the code
 * differs from the key itself (presumably an empty value means "same as
 * the key" - TODO confirm against the consumer).
 */
var $isoArray = array(
	'dk' => 'da',
	'de' => '',
	'no' => '',
	'it' => '',
	'fr' => '',
	'es' => '',
	'nl' => '',
	'cz' => 'cs',
	'pl' => '',
	'si' => 'sl',
	'fi' => '',
	'tr' => '',
	'se' => 'sv',
	'pt' => '',
	'ru' => '',
	'ro' => '',
	'ch' => 'zh_CN',
	'sk' => '',
	'lt' => '',
	'is' => '',
	'hr' => '',
	'hu' => '',
	'gl' => '',
	'th' => '',
	'gr' => 'el',
	'hk' => 'zh_HK',
	'eu' => '',
	'bg' => '',
	'br' => 'pt_BR',
	'et' => '',
	'ar' => '',
	'he' => 'iw',
	'ua' => 'uk',
	'jp' => 'ja',
	'lv' => '',
	'vn' => 'vi',
	'ca' => '',
	'ba' => '',
	'kr' => '',
);
00497
/**
 * Normalizes a charset name: lower-cases it and replaces a known synonym
 * by the name used internally (see $synonyms).
 *
 * @param	string		Charset name in any notation
 * @return	string		Lower-cased name, or the synonym target if one is registered
 */
function parse_charset($charset) {
	$normalized = strtolower($charset);

	return isset($this->synonyms[$normalized])
		? $this->synonyms[$normalized]
		: $normalized;
}
00511
/**
 * Determines the charset belonging to a locale string of the form
 * "language[_country][.charset][@modifier]".
 *
 * Order of evaluation:
 *  1. the complete locale is listed in $locale_to_charset
 *  2. the locale carries an explicit charset (normalized via parse_charset())
 *  3. the modifier is "euro" => iso-8859-15
 *  4. the language (or its language family) is listed in the charset table
 *     for the current operating system
 *  5. fallback: iso-8859-1
 *
 * @param	string		Locale string, e.g. "de_DE.ISO-8859-1@euro"
 * @return	string		Charset name
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale-specific charset?
	if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];

		// Split off "@modifier" and ".charset". The parts are optional, so the
		// arrays are padded to avoid undefined offsets in list().
	$pieces = array_pad(explode('@',$locale), 2, '');
	$locale = $pieces[0];
	$modifier = $pieces[1];

	$pieces = array_pad(explode('.',$locale), 2, '');
	$locale = $pieces[0];
	$charset = $pieces[1];
	if ($charset) return $this->parse_charset($charset);

	if ($modifier == 'euro') return 'iso-8859-15';

		// Reduce "language_country" to the language and map it to its family
	$pieces = explode('_',$locale);
	$language = $pieces[0];
	if (isset($this->lang_to_langfamily[$language])) $language = $this->lang_to_langfamily[$language];

	if (TYPO3_OS == 'WIN') {
		$cs = isset($this->lang_to_charset_windows[$language]) ? $this->lang_to_charset_windows[$language] : '';
	} else {
		$cs = isset($this->lang_to_charset_unix[$language]) ? $this->lang_to_charset_unix[$language] : '';
	}

		// unknown language/family or empty table entry => default
	return $cs ? $cs : 'iso-8859-1';
}
00552
00553
00554
00555
00556
00557
00558
00559
00560
00561
00562
00563
00564
00565
00566
/**
 * Converts a string from one charset to another.
 *
 * If the target is utf-8 or no entity substitution is requested, the
 * conversion library configured in
 * $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] (mbstring, iconv or recode)
 * is tried first. If it is not configured or reports failure, the string is
 * converted through UTF-8 using the tables of this class.
 *
 * @param	string		Input string
 * @param	string		Source charset (already normalized, see parse_charset())
 * @param	string		Target charset (already normalized)
 * @param	boolean		If set, characters missing in the target charset become numeric entities
 * @return	string		Converted string
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	if ($fromCS==$toCS) return $str;

		// The external libraries cannot produce entities for unknown characters
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
		$convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
			? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
			: '';

		switch($convMethod) {
			case 'mbstring':
				$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
				if (false !== $conv_str) return $conv_str;
			break;

			case 'iconv':
					// "//TRANSLIT" asks iconv to approximate characters missing in the target charset
				$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
				if (false !== $conv_str) return $conv_str;
			break;

			case 'recode':
				$conv_str = recode_string($fromCS.'..'.$toCS,$str);
				if (false !== $conv_str) return $conv_str;
			break;
		}
			// fall through to the built-in conversion
	}

	if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
	return $str;
}
00605
/**
 * Converts all string values of an array (recursively) from one charset to
 * another by means of conv(). The array is modified in place.
 *
 * @param	array		Array to convert, passed by reference
 * @param	string		Source charset
 * @param	string		Target charset
 * @param	boolean		Passed on to conv(): use entities for characters missing in the target charset
 * @return	void
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
	foreach ($array as $idx => $unused) {
		if (!is_array($array[$idx])) {
				// leaf value
			$array[$idx] = $this->conv($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}

			// nested array: descend
		$this->convArray($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
	}
}
00626
/**
 * Converts a string from the given charset to UTF-8 using the conversion
 * table loaded by initCharset().
 *
 * Fixed: for two-byte charsets the character code was built with
 * "$ord<<8 & $ord2", which is always 0; the two bytes are now combined
 * with "|" (first byte is the high byte).
 *
 * @param	string		String in the local charset
 * @param	string		Charset name (normalized); 'utf-8' returns the input unchanged
 * @return	string		UTF-8 string; nothing (NULL) is returned if the charset table cannot be loaded
 */
function utf8_encode($str,$charset) {
	if ($charset === 'utf-8') return $str;

	if ($this->initCharset($charset)) {
		$strLen = strlen($str);
		$outStr='';

		for ($a=0;$a<$strLen;$a++) {
			$chr=substr($str,$a,1);
			$ord=ord($chr);
			if (isset($this->twoByteSets[$charset])) {
					// fixed-width: always consume two bytes; a missing second byte counts as 0
				$ord2 = ord(substr($str,$a+1,1));
				$ord = ($ord << 8) | $ord2;

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
					$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.=chr($this->noCharByteVal);
				$a++;
			} elseif ($ord>127) {
				if (isset($this->eucBasedSets[$charset])) {
						// shift_jis has single-byte characters in 0xA0-0xDF; everything else is a two-byte character
					if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) {
						$a++;
						$ord2=ord(substr($str,$a,1));
						$ord = $ord*256+$ord2;
					}
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
					$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.= chr($this->noCharByteVal);
			} else $outStr.= $chr;	// plain ASCII is passed through
		}
		return $outStr;
	}
}
00671
/**
 * Converts a UTF-8 string to the given charset using the conversion table
 * loaded by initCharset().
 *
 * @param	string		UTF-8 string
 * @param	string		Target charset (normalized)
 * @param	boolean		If set, characters missing in the target charset are written as "&#x...;" entities instead of the no-char byte
 * @return	string		String in the target charset; nothing (NULL) is returned if the charset table cannot be loaded
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {
	if (!$this->initCharset($charset)) return;

	$fallback = chr($this->noCharByteVal);
	$total = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $total) {
		$byte = substr($str,$pos++,1);
		$code = ord($byte);

			// ASCII is copied verbatim
		if ($code <= 127) {
			$result.= $byte;
			continue;
		}

			// a continuation byte (10xxxxxx) without a lead byte
		if (!($code & 64)) {
			$result.= $fallback;
			continue;
		}

			// collect the whole sequence: each further leading 1-bit of the
			// lead byte announces one continuation byte
		$seq = $byte;
		$bits = $code;
		for ($n=0;$n<8;$n++) {
			$bits = $bits << 1;
			if (!($bits & 128)) break;
			$seq.= substr($str,$pos++,1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$seq])) {
			$local = $this->parsedCharsets[$charset]['utf8'][$seq];
				// codes above 255 are written as two bytes, high byte first
			$result.= ($local > 255)
				? chr(($local >> 8) & 255).chr($local & 255)
				: chr($local);
		} elseif ($useEntityForNoChar) {
			$result.= '&#'.$this->utf8CharToUnumber($seq,1).';';
		} else {
			$result.= $fallback;
		}
	}

	return $result;
}
00716
/**
 * Replaces every multi-byte UTF-8 character by a hexadecimal numeric
 * entity ("&#x...;"). ASCII bytes are left alone; a stray continuation
 * byte is replaced by the no-char byte.
 *
 * @param	string		UTF-8 string
 * @return	string		String in which all non-ASCII characters are entities
 */
function utf8_to_entities($str) {
	$total = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $total) {
		$byte = substr($str,$pos++,1);
		$code = ord($byte);

		if ($code <= 127) {
			$result.= $byte;
			continue;
		}

			// continuation byte without a lead byte
		if (!($code & 64)) {
			$result.= chr($this->noCharByteVal);
			continue;
		}

			// gather the continuation bytes announced by the lead byte
		$seq = $byte;
		$bits = $code;
		for ($n=0;$n<8;$n++) {
			$bits = $bits << 1;
			if (!($bits & 128)) break;
			$seq.= substr($str,$pos++,1);
		}

		$result.= '&#'.$this->utf8CharToUnumber($seq,1).';';
	}

	return $result;
}
00748
/**
 * Converts numeric entities ("&#233;", "&#xE9;") - and optionally the
 * named HTML entities - in a string to UTF-8 characters. Unknown named
 * entities are left untouched.
 *
 * Changes compared to the previous implementation:
 *  - ereg_replace() (removed in PHP 7) is replaced by a single preg_split()
 *    that captures the entity names.
 *  - the HTML translation table is requested explicitly in ISO-8859-1,
 *    because its characters are fed to utf8_encode(...,'iso-8859-1');
 *    the default table of newer PHP versions is already UTF-8 and would be
 *    encoded twice.
 *  - decimal entity values are cast to integer before the bit arithmetic
 *    in UnumberToChar(); "&#X..;" with a capital X is accepted as well.
 *
 * @param	string		Input string, UTF-8 with entities
 * @param	boolean		If set, named HTML entities (&nbsp;, &auml;, ...) are converted too
 * @return	string		UTF-8 string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1'));
	}

		// Odd indexes of $parts hold the entity names (without "&" and ";"),
		// even indexes hold the text in between.
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	if ($parts === false) return $str;	// regex engine failure: leave the input alone

	foreach($parts as $k => $v) {
		if ($k%2) {
			if (substr($v,0,1)=='#') {	// numeric entity
				$marker = substr($v,1,1);
				if ($marker=='x' || $marker=='X') {
					$parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
				} else {
					$parts[$k] = $this->UnumberToChar(intval(substr($v,1)));
				}
			} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) {	// named entity
				$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
			} else {	// not convertible: restore the original text
				$parts[$k] ='&'.$v.';';
			}
		}
	}

	return implode('',$parts);
}
00781
/**
 * Splits a UTF-8 string into its characters and returns them either as
 * Unicode numbers or as the (multi-byte) character strings themselves.
 *
 * @param	string		UTF-8 string
 * @param	boolean		If set, entities are converted to UTF-8 first (entities_to_utf8() including named HTML entities)
 * @param	boolean		If set, the array holds the characters instead of their Unicode numbers
 * @return	array		One entry per character
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

	$total = strlen($str);
	$result = array();
	$pos = 0;

	while ($pos < $total) {
		$byte = substr($str,$pos++,1);
		$code = ord($byte);

			// ASCII character
		if ($code <= 127) {
			$result[] = $retChar ? chr($code) : $code;
			continue;
		}

			// continuation byte without a lead byte: report the no-char value
		if (!($code & 64)) {
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// multi-byte character: collect the announced continuation bytes
		$seq = $byte;
		$bits = $code;
		for ($n=0;$n<8;$n++) {
			$bits = $bits << 1;
			if (!($bits & 128)) break;
			$seq.= substr($str,$pos++,1);
		}

		$result[] = $retChar ? $seq : $this->utf8CharToUnumber($seq);
	}

	return $result;
}
00820
/**
 * Converts a Unicode number to its UTF-8 byte sequence (1 to 6 bytes).
 * Numbers of 0x80000000 and above yield the no-char byte.
 *
 * @param	integer		Unicode number
 * @return	string		UTF-8 encoded character
 */
function UnumberToChar($cbyte) {
		// 7 bit: plain ASCII
	if ($cbyte < 0x80) return chr($cbyte);

		// Pick the lead-byte marker and the number of continuation bytes
	if ($cbyte < 0x800) {
		$marker = 0xC0;
		$tail = 1;
	} else if ($cbyte < 0x10000) {
		$marker = 0xE0;
		$tail = 2;
	} else if ($cbyte < 0x200000) {
		$marker = 0xF0;
		$tail = 3;
	} else if ($cbyte < 0x4000000) {
		$marker = 0xF8;
		$tail = 4;
	} else if ($cbyte < 0x80000000) {
		$marker = 0xFC;
		$tail = 5;
	} else {
			// out of range
		return chr($this->noCharByteVal);
	}

		// Lead byte carries the top bits, each continuation byte the next 6 bits
	$str = chr($marker | ($cbyte >> (6*$tail)));
	for ($shift = 6*($tail-1); $shift >= 0; $shift -= 6) {
		$str.= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $str;
}
00875
/**
 * Converts a single UTF-8 character to its Unicode number.
 * Only the first character of the string is evaluated.
 *
 * @param	string		UTF-8 encoded character (1 or more bytes)
 * @param	boolean		If set, the number is returned as hexadecimal string prefixed with "x" (e.g. "x20ac")
 * @return	mixed		Unicode number (integer), or its "x..." hex notation
 */
function utf8CharToUnumber($str,$hex=0) {
	$lead = ord(substr($str,0,1));

	if ($lead < 0xC0) {
			// not a multi-byte lead byte (top two bits are not both set)
		$int = $lead;
	} else {
			// Every further leading 1-bit of the lead byte announces one
			// continuation byte which contributes its lower 6 bits.
		$probe = $lead;
		$tailBits = '';
		$count = 0;
		while ($count < 8) {
			$probe = $probe << 1;
			if (!($probe & 128)) break;
			$tailBits.= sprintf('%06b', ord(substr($str,$count+1,1)) & 63);
			$count++;
		}

			// The lead byte contributes its lowest (6 - count) bits
		$leadBits = substr('00000000'.decbin($lead),-(6-$count));

		$int = bindec($leadBits.$tailBits);
	}

	return $hex ? 'x'.dechex($int) : $int;
}
00903
00904
00905
00906
00907
00908
00909
00910
00911
00912
00913
00914
00915
00916
00917
/**
 * Loads the conversion table of a charset into $this->parsedCharsets.
 * The table is read from the cache file in typo3temp/cs/ if present,
 * otherwise parsed from PATH_t3lib/csconvtbl/[charset].tbl and then cached.
 *
 * Two table line formats are understood; the format is detected on the
 * first data line:
 *  - "whitespaced": "0xNN <whitespace> 0xNNNN <whitespace> ..."
 *  - "ms-token":    fields separated by "=" or ":" with the Unicode value written as "U+NNNN"
 * Only codes above 127 are stored.
 *
 * ereg()/split() (removed in PHP 7) have been replaced by their PCRE
 * counterparts with identical patterns.
 *
 * @param	string		Charset name (normalized); used as part of the table file name
 * @return	mixed		1 if the table was already loaded, 2 if it was loaded now, FALSE if no table file exists
 */
function initCharset($charset) {
		// already in memory?
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) return 1;

	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) return false;

	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->parsedCharsets[$charset]=unserialize(t3lib_div::getUrl($cacheFile));
	} else {
			// parse the table file, one mapping per non-empty, non-comment line
		$lines=t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);

		$this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array());

		$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
		$detectedType='';
		foreach($lines as $value) {
			if (trim($value) && substr($value,0,1)!='#') {

					// the first data line decides the format for the whole file
				if (!$detectedType) $detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

				$hexbyte = '';
				$utf8 = '';
				if ($detectedType=='ms-token') {
					$fields = array_pad(preg_split('/=|:/',$value,3), 2, '');
					$hexbyte = $fields[0];
					$utf8 = $fields[1];
				} elseif ($detectedType=='whitespaced') {
					$regA=array();
					if (preg_match($whitespacedPattern,$value,$regA)) {
						$hexbyte = $regA[1];
						$utf8 = 'U+'.$regA[2];
					}
				}

				$decval = hexdec(trim($hexbyte));
				if ($decval>127) {
						// strip the "U+" prefix before converting the Unicode number
					$utf8decval = hexdec(substr(trim($utf8),2));
					$this->parsedCharsets[$charset]['local'][$decval]=$this->UnumberToChar($utf8decval);
					$this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]]=$decval;
				}
			}
		}

		if ($cacheFile) {
			t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
		}
	}

	return 2;
}
00980
/**
 * Builds the UTF-8 case folding table ($this->caseFolding['utf-8']) and the
 * UTF-8 to-ASCII table ($this->toASCII['utf-8']) from the Unicode data files
 * in PATH_t3lib/unidata/ and caches both in typo3temp/cs/.
 *
 * Sources:
 *  - UnicodeData.txt   (required) simple case mappings, decompositions, numeric values; only code points up to U+FFFF are read
 *  - SpecialCasing.txt (optional) unconditional multi-character case mappings
 *  - Translit.txt      (optional) custom transliterations which override the decompositions
 *
 * Changes compared to the previous implementation: split()/ereg() (removed
 * in PHP 7) and the "{}" string offset syntax (removed in PHP 8) are
 * replaced by explode()/preg_match()/substr(); the result of fgets() is
 * checked; list() targets are padded so short lines cause no undefined
 * offsets; hexdec() no longer receives the "U+" prefix of the table keys.
 *
 * @param	string		'case' or 'ascii': return early if that table is already in memory or can be read from its cache file. Any other value rebuilds both tables.
 * @return	mixed		1 if the requested table was already loaded, 2 if it was read from the cache, 3 if the tables were built from the data files, FALSE if UnicodeData.txt is not available
 */
function initUnicodeData($mode=null) {
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Shortcuts: table in memory or in the cache
	switch($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
		break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
		break;
	}

		// Process the main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

		// Key = UTF-8 character, value = UTF-8 string of the folded character(s)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8'];
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// "U+XXXX" => list of hex code points
	$mark = array();			// "U+XXXX" => 1 for characters of category M* (dropped from decompositions)
	$number = array();			// "U+XXXX" => numeric value for characters of category N*
	$omit = array();			// "U+XXXX" => 1 for characters which are transliterated to nothing

	while (!feof($fh)) {
		$line = fgets($fh,4096);
		if ($line === false) break;
		if (trim($line) == '') continue;

			// Field positions of UnicodeData.txt:
			// 0 code, 1 name, 2 category, 5 decomposition, 8 numeric value, 12 upper, 13 lower, 14 title
		$fields = array_pad(explode(';', rtrim($line)), 15, '');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only the first 64k code points are processed

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store the title case only if it differs from the upper case
		if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		switch (substr($cat,0,1)) {
			case 'M':	// mark
				$mark["U+$char"] = 1;
			break;

			case 'N':	// number with a numeric value
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
		}

			// Latin letters "WITH something" that have no decomposition are
			// mapped to their base letter
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch($match[1]) {
				case '<circle>':	// add parenthesis as circle replacement
					$match[2] = '0028 '.$match[2].' 0029';
				break;

				case '<square>':	// add square brackets as square replacement
					$match[2] = '005B '.$match[2].' 005D';
				break;

				case '<compat>':	// skip compatibility decompositions starting with a space
					if (substr($match[2],0,5) == '0020 ') continue 2;
				break;

					// positional and vertical presentation forms are ignored
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

		// Process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false) break;

				if (substr($line,0,1) != '#' && trim($line) != '') {
					list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');

						// only unconditional mappings (no condition, or the field holds just the trailing comment)
					if ($cond == '' || substr($cond,0,1) == '#') {
						$utf8_char = $this->UnumberToChar(hexdec($char));
						if ($char != $lower) {
							$arr = explode(' ',$lower);
							for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							$utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
						}
						if ($char != $title && $title != $upper) {
							$arr = explode(' ',$title);
							for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							$utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
						}
						if ($char != $upper) {
							$arr = explode(' ',$upper);
							for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							$utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
						}
					}
				}
			}
			fclose($fh);
		}
	}

		// Process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false) break;

				if (substr($line,0,1) != '#' && trim($line) != '') {
					list($char,$translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
					if (!$translit) $omit["U+$char"] = 1;
					$decomposition["U+$char"] = explode(' ', $translit);
				}
			}
			fclose($fh);
		}
	}

		// Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// Create the ASCII-only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127)
				continue 2;	// skip decompositions containing non-ASCII characters
			else
				array_push($code_decomp,chr($ord));
		}
			// keys are "U+XXXX": drop the prefix before converting
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = join('',$code_decomp);
	}

		// Add numeric decompositions where no other replacement exists
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
01206
/**
 * Prepares the case folding table ($this->caseFolding[$charset]) of a
 * charset. The table is read from its cache file in typo3temp/cs/ or
 * derived from the charset conversion table and the UTF-8 case folding
 * data, and then cached.
 *
 * Lookups are guarded with isset() so charsets that are not loaded yet and
 * characters without a folding entry no longer raise undefined-index
 * notices; the resulting table is unchanged.
 *
 * @param	string		Charset name (normalized)
 * @return	mixed		1 if the table was already loaded, 2 if read from the cache, 3 if built now, FALSE if the charset table or the Unicode data is not available
 */
function initCaseFolding($charset) {
		// already in memory?
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) return 1;

		// cached?
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// both the charset table and the UTF-8 folding data are needed
	if (!$this->initCharset($charset)) {
		return false;
	}
	if (!$this->initUnicodeData('case')) {
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$foldings = array('toUpper','toLower','toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// the character in the local charset
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($foldings as $folding) {
			if (!isset($this->caseFolding['utf-8'][$folding][$utf8])) continue;

				// keep the folded character only if it exists in the local charset
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$folding][$utf8], $charset);
			if ($cc != '' && $cc != $nochar) $this->caseFolding[$charset][$folding][$c] = $cc;
		}
	}

		// add the ASCII letters (the conversion table only holds codes above 127)
	for ($i=ord('a'); $i<=ord('z'); $i++) {
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++) {
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
01268
/**
 * Prepares the to-ASCII table ($this->toASCII[$charset]) of a charset.
 * The table is read from its cache file in typo3temp/cs/ or derived from
 * the charset conversion table and the UTF-8 to-ASCII data, and then cached.
 *
 * Fixed: the table is now initialised as an empty array before it is
 * filled. Previously a charset without any mapping left the entry
 * undefined, so the cache was written from an undefined value and the
 * table was rebuilt on every call.
 *
 * @param	string		Charset name (normalized)
 * @return	mixed		1 if the table was already loaded, 2 if read from the cache, 3 if built now, FALSE if the charset table or the Unicode data is not available
 */
function initToASCII($charset) {
		// already in memory?
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

		// cached?
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// both the charset table and the UTF-8 to-ASCII data are needed
	if (!$this->initCharset($charset)) {
		return false;
	}
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

	$this->toASCII[$charset] = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// the character in the local charset
		$c = $this->utf8_decode($utf8, $charset);

		if (isset($this->toASCII['utf-8'][$utf8])) {
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
01314
01315
01316
01317
01318
01319
01320
01321
01322
01323
01324
01325
01326
01327
01328
01329
01330
01331
01332
01333
01334
01335
/**
 * Returns a part of a string, counting in characters of the given charset.
 * Multi-byte aware counterpart of PHP's substr().
 *
 * Fixes:
 *  - the mbstring branch operated on 'utf-8' regardless of $charset; it now
 *    uses the charset that was passed in.
 *  - for the fixed-width (two/four byte) charsets an omitted $len resulted in
 *    a length of 0 and therefore an empty string; the rest of the string is
 *    now returned, as in every other branch.
 *  - set lookups are guarded with isset().
 *
 * @param	string		Charset of the string (normalized)
 * @param	string		String to cut
 * @param	integer		Start position in characters
 * @param	integer		Length in characters; NULL (default) means "to the end of the string"
 * @return	string		The substring
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring') {
			// older mb_substr() versions cannot take a charset without a
			// length, so the internal encoding is switched temporarily
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	} elseif (isset($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif (isset($this->twoByteSets[$charset])) {
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif (isset($this->fourByteSets[$charset])) {
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
	return $len === NULL ? substr($string,$start) : substr($string,$start,$len);
}
01375
/**
 * Counts the characters of a string in the given charset.
 * Multi-byte aware counterpart of PHP's strlen().
 *
 * Set and configuration lookups are guarded with isset(). For the
 * fixed-width charsets the result is now always an integer (an incomplete
 * trailing character is not counted) instead of a fractional number.
 *
 * @param	string		Charset of the string (normalized)
 * @param	string		String to measure
 * @return	integer		Number of characters
 */
function strlen($charset,$string) {
	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring') {
		return mb_strlen($string,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	} elseif (isset($this->eucBasedSets[$charset])) {
		return $this->euc_strlen($string,$charset);
	} elseif (isset($this->twoByteSets[$charset])) {
		return (int)floor(strlen($string)/2);
	} elseif (isset($this->fourByteSets[$charset])) {
		return (int)floor(strlen($string)/4);
	}

		// treat everything else as single-byte encoding
	return strlen($string);
}
01401
/**
 * Truncates a string to a number of characters and adds a marker at the
 * cut. A positive length keeps the beginning of the string (marker is
 * appended), a negative length keeps the end (marker is prepended). The
 * string is returned unchanged if it is not longer than the limit.
 *
 * The "{}" string offset syntax (removed in PHP 8), which served as
 * "is there a byte at this position" probe, is replaced by an explicit
 * bounds check with the same outcome; the set lookup is guarded with isset().
 *
 * @param	string		Charset of the string (normalized)
 * @param	string		String to truncate
 * @param	integer		Number of characters to keep; negative values count from the end; 0 disables cropping
 * @param	string		Marker added at the cut, e.g. "..."
 * @return	string		The cropped string
 */
function crop($charset,$string,$len,$crop='') {
	if (intval($len) == 0) return $string;

		// translate the character position into a byte position
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (isset($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} else {
			// single-byte charset: byte position equals character position
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string)+$len;
			if ($i<=0) $i = false;
		}
	}

		// position lies outside the string: nothing to crop
	if ($i === false) return $string;

	$byteLen = strlen($string);
	if ($len > 0) {
			// crop only if something follows the cut position
		if ($i >= 0 && $i < $byteLen) {
			return substr($string,0,$i).$crop;
		}
	} else {
			// crop only if something precedes the cut position
		if ($i-1 >= 0 && $i-1 < $byteLen) {
			return $crop.substr($string,$i);
		}
	}

	return $string;
}
01456
/**
 * Truncates a string to at most $len bytes without cutting a multi-byte
 * character in half.
 *
 * Fixed: for the EUC based charsets the requested length was not passed on
 * to euc_strtrunc(). That method is defined outside this view; its
 * signature is assumed to be ($str,$len,$charset), analogous to
 * euc_char2byte_pos() - NOTE(review): confirm.
 * Set and configuration lookups are guarded with isset().
 *
 * @param	string		Charset of the string (normalized)
 * @param	string		String to truncate
 * @param	integer		Maximum length in bytes
 * @return	string		The truncated string; empty if $len is 0 or negative
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif (isset($this->eucBasedSets[$charset])) {
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (isset($this->twoByteSets[$charset])) {
			// round down to a multiple of the character width
		if ($len % 2) $len--;
	} elseif (isset($this->fourByteSets[$charset])) {
		$x = $len % 4;
		$len -= $x;
	}

		// fixed-width and single-byte charsets can be cut by bytes
	return substr($string,0,$len);
}
01485
/**
 * Converts a string to lower or upper case, taking its charset into account.
 *
 * Fixes:
 *  - the mbstring branch converted the string as 'utf-8' regardless of
 *    $charset; it now uses the charset that was passed in.
 *  - the PHP version is checked with version_compare() instead of a float cast.
 *  - the unreachable trailing "return $string" was removed.
 *
 * @param	string		Charset of the string (normalized)
 * @param	string		String to convert
 * @param	string		'toLower' or 'toUpper'
 * @return	string		The converted string
 */
function conv_case($charset,$string,$case) {
	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring' && version_compare(phpversion(),'4.3.0','>=')) {
		if ($case == 'toLower') {
			return mb_strtolower($string,$charset);
		}
		return mb_strtoupper($string,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	} elseif (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
01519
01527 function specCharsToASCII($charset,$string) {
01528 if ($charset == 'utf-8') {
0152