Files
wy/ZeedFramework/library/Zeed/Util/Zh.php
2026-01-07 11:40:41 +08:00

331 lines
11 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
/**
* iNewS Project
*
* LICENSE
*
* http://www.inews.com.cn/license/inews
*
* @category iNewS
* @package ChangeMe
* @subpackage ChangeMe
* @copyright Copyright (c) 2008 Zeed Technologies PRC Inc. (http://www.inews.com.cn)
* @author xSharp ( GTalk: xSharp@gmail.com )
* @since Jul 3, 2008
* @version SVN: $Id: Zh.php 8996 2010-12-21 08:34:44Z xsharp $
*/
/**
* 中文字串(GBK)转成拼音
*/
class Zeed_Util_Zh
{
private static $_PinyinTable = null;
/**
* 汉字转成拼音
*
* @param string $string 汉字串
* @param boolean $permutationAndCombination 是否进行排列组合
* @return string|array
*/
public static function pinyin($string, $permutationAndCombination = false)
{
if (is_null(self::$_PinyinTable)) {
self::$_PinyinTable = include ZEED_PATH_3rd . '/PinyinTable.inc';
}
$flow = array();
for($i = 0; $i < strlen($string); $i ++) {
if (ord($string[$i]) >= 0x81 and ord($string[$i]) <= 0xfe) {
$h = ord($string[$i]);
if (isset($string[$i + 1])) {
$i ++;
$l = ord($string[$i]);
if (isset(self::$_PinyinTable[$h][$l])) {
array_push($flow, self::$_PinyinTable[$h][$l]);
} else {
array_push($flow, $h);
array_push($flow, $l);
}
} else {
array_push($flow, ord($string[$i]));
}
} else {
array_push($flow, ord($string[$i]));
}
}
$pinyin = '';
if (count($flow) > 0) {
if (! $permutationAndCombination) {
foreach ($flow as $val) {
if (is_array($val)) {
$pinyin .= ucfirst($val[0]);
} else {
$pinyin .= chr($val);
}
}
} else {
foreach ($flow as $key => $val) {
if (! is_array($val)) {
$flow[$key] = array(
chr($val));
} else {
$flow[$key] = array_map('ucfirst', $val);
}
}
$pc = Zeed_Util_Zh::permutationAndCombination($flow);
$pinyin = array();
foreach ($pc as $p) {
$pinyin[] = implode('', $p);
}
}
}
return $pinyin;
}
/**
* 将指定的字符串转为GBK
*
* 编码 代码页 简介
* GB2312 CP20936 收录文字6763个(简体中文)
* GBK CP936 收录文字21003个(包括简体、繁体、日文、朝鲜文。兼容GB2312。)
* GB18030 CP54936 收录文字27533个(包括简体、繁体、少数民族文字日文、朝鲜文。兼容GBK不兼容BIG5。)
* BIG5 CP950 收录文字13053个(繁体中文)
* BIG-5
* 《Unicode、GB2312、GBK和GB18030中的汉字》: http://www.fmddlmyy.cn/text24.html
*
* @param string $string
* @return string
*/
public static function convert2GBK($string)
{
if (NULL != $cs = mb_detect_encoding($string, array(
'UTF-8',
'GBK',
'BIG5'))) {
if ($cs != 'CP936' || $cs != 'CP54936') {
$string = iconv($cs, 'GBK', $string);
}
}
return $string;
}
/**
* 判断中文字符集(UTF-8/GBK/BIG5)
*
* http://php.net/manual/en/mbstring.supported-encodings.php
* http://www.gnu.org/software/libiconv/
* http://www.unicode.org/
* http://www.fmddlmyy.cn/text24.html
*
* @param string $string
* @return unknown
*/
public static function isUGB($string)
{
if (function_exists('mb_detect_encoding')) {
if (NULL != $cs = mb_detect_encoding($string, array(
'UTF-8',
'BIG5',
'GBK'))) {
switch($cs) {
case 'CP20936' :
return 'GBK';
case 'CP936' :
return 'GBK';
case 'CP54936' :
return 'GBK';
case 'CP950' :
return 'BIG5';
case 'BIG-5' :
return 'BIG5';
case 'UTF-8' :
return 'UTF-8';
default :
return $cs;
}
}
return NULL;
}
return self::_isUGB($string);
}
/**
* 编码 第一字节 第二字节
* GB2312 0xB0-0xF7(176-247) 0xA0-0xFE(160-254)
* GBK 0x81-0xFE(129-254) 0x40-0xFE(64-254)
* BIG5 0x81-0xFE(129-255) 0x40-0x7E(64-126),0xA10xFE(161-254)
*
* 一般是这样辨别GBK/BIG5的
* 1、GBK的内码的两个字节都是从A0H-FEH之间的
* 2、BIG5的内码的第一个字节是80H-FFH第二个字节是00H-FFH
*
* @param string $strtext
* @return string 返回:UTF-8/GBK/BIG5/null
*/
public static function _isUGB($string)
{
$UGB = null;
$length = strlen($string);
for($i = 0; $i < $length; $i ++) {
if (($ch1 = ord($string[$i])) > 0xE0) {
// UTF-8
return "UTF-8";
} elseif ($ch1 >= 0x81) {
// 中文
$ch2 = ord($string[$i + 1]);
/**
if ($ch1 >= 0xB0 && $ch1 <= 0xF7 && $ch2 >= 0xA0 && $ch2 <= 0xFE) { // GB2312
$GB2312found = true;
} else {
$GB2312found = false;
}
*/
if ($ch1 >= 0x81 && $ch1 <= 0xFE && (($ch2 >= 0x40 && $ch2 <= 0x7E) || ($ch2 >= 0xA1 && $ch2 <= 0xFE))) { //BIG5
$BIG5found = true;
} else {
$BIG5found = false;
}
if ($ch1 >= 0x81 && $ch1 <= 0xFE && $ch2 >= 0x40 && $ch2 <= 0xFE) { // GBK
$GBKfound = true;
} else {
$GBKfound = false;
}
if ($BIG5found && $GBKfound) {
if ($ch1 > 0xA0 && $ch1 < 0xFE && $ch2 > 0xA0 && $ch2 < 0xFE) { // GBK汉字两个字节都是从A0H-FEH之间的
$UGB = 'GBK';
}
if ($ch2 < 0x7F) { // 看第二个字节是否小于0x7F如果是的的话一般是BIG5。
return 'BIG5';
}
// 检查下一个字
$i ++;
continue;
} else {
return $BIG5found ? 'BIG5' : 'GBK';
}
}
}
return $UGB;
}
/**
* 废弃
*/
private static function _isUGB_deprecated($strtext)
{
$UGB = NULL;
$length = strlen($strtext);
for($i = 0; $i < $length; $i ++) {
//先判断UTF-8UTF-8的是三个字节, GBKBIG5是两个字节,可以分离
if (ord(substr($strtext, $i)) > 0xE0) {
$UGB = "UTF-8";
break;
} elseif (ord(substr($strtext, $i)) > 0xA1) {
$UGB = "BIG5";
break;
} elseif (ord(substr($strtext, $i)) > 0x80) {
$UGB = "GBK";
break;
}
}
return $UGB;
}
/**
* From http://w3.org/International/questions/qa-forms-utf-8.html
*/
public static function isUTF8($string)
{
return preg_match('%^(?:
[\x09\x0A\x0D\x20-\x7E] # ASCII
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
)*$%xs', $string);
}
/**
* 要解决的数学问题算出C(a,1) * C(b, 1) * ... * C(n, 1)的组合情况其中C(n, 1)代表从n个元素里任意取一个元素
*
* 要解决的实际问题样例某年级有m个班级每个班的人数不同现在要从每个班里抽选一个人组成一个小组
* 由该小组来代表该年级参加学校的某次活动,请给出所有可能的组合
*
* 需要进行排列组合的数组
* 数组说明:该数组是一个二维数组,第一维索引代表班级编号,第二维索引代表学生编号
*
* @param array $CombinList 二维数组
* @return array
*/
public static function permutationAndCombination($CombinList)
{
/*
$CombinList = array(
1 => array(
"Student10",
"Student11"),
2 => array(
"Student20",
"Student21",
"Student22"),
3 => array(
"Student30"),
4 => array(
"Student40",
"Student41",
"Student42",
"Student43"));
*/
/* 计算C(a,1) * C(b, 1) * ... * C(n, 1)的值 */
$CombineCount = 1;
foreach ($CombinList as $Value) {
$CombineCount *= count($Value);
}
$Result = array();
$RepeatTime = $CombineCount;
foreach ($CombinList as $ClassNo => $StudentList) {
// $StudentList中的元素在拆分成组合后纵向出现的最大重复次数
$RepeatTime = $RepeatTime / count($StudentList);
$StartPosition = 1;
// 开始对每个班级的学生进行循环
foreach ($StudentList as $Student) {
$TempStartPosition = $StartPosition;
$SpaceCount = $CombineCount / count($StudentList) / $RepeatTime;
for($J = 1; $J <= $SpaceCount; $J ++) {
for($I = 0; $I < $RepeatTime; $I ++) {
$Result[$TempStartPosition + $I][$ClassNo] = $Student;
}
$TempStartPosition += $RepeatTime * count($StudentList);
}
$StartPosition += $RepeatTime;
}
}
return $Result;
}
}
// End ^ LF ^ UTF-8