331 lines
11 KiB
PHP
331 lines
11 KiB
PHP
|
|
<?php
|
|||
|
|
/**
|
|||
|
|
* iNewS Project
|
|||
|
|
*
|
|||
|
|
* LICENSE
|
|||
|
|
*
|
|||
|
|
* http://www.inews.com.cn/license/inews
|
|||
|
|
*
|
|||
|
|
* @category iNewS
|
|||
|
|
* @package ChangeMe
|
|||
|
|
* @subpackage ChangeMe
|
|||
|
|
* @copyright Copyright (c) 2008 Zeed Technologies PRC Inc. (http://www.inews.com.cn)
|
|||
|
|
* @author xSharp ( GTalk: xSharp@gmail.com )
|
|||
|
|
* @since Jul 3, 2008
|
|||
|
|
* @version SVN: $Id: Zh.php 8996 2010-12-21 08:34:44Z xsharp $
|
|||
|
|
*/
|
|||
|
|
|
|||
|
|
/**
 * Chinese text helpers: converts GBK-encoded Chinese strings to pinyin and
 * offers utilities for detecting / converting Chinese character encodings
 * (UTF-8, GBK, BIG5).
 */
class Zeed_Util_Zh
{
    /**
     * Pinyin lookup table, loaded on first use from
     * ZEED_PATH_3rd . '/PinyinTable.inc'.
     *
     * It is indexed as [lead byte][trail byte]; each entry is used as a list
     * of pinyin readings (pinyin() reads element 0 and maps over the entry).
     *
     * @var array|null
     */
    private static $_PinyinTable = null;

    /**
     * Converts a Chinese string to pinyin.
     *
     * The input is scanned byte by byte as a double-byte (GBK style) string:
     * a byte in 0x81-0xFE followed by another byte is looked up in the pinyin
     * table as a (lead, trail) pair. Pairs that are not in the table, and all
     * other bytes, are copied to the output unchanged. Every pinyin reading is
     * emitted with its first letter upper-cased.
     *
     * NOTE: in permutation mode the number of results is the product of the
     * number of readings of every character, so it can grow very quickly for
     * long inputs containing many polyphonic characters.
     *
     * @param string  $string                    Chinese string (expected to be GBK encoded)
     * @param boolean $permutationAndCombination When false, only the first reading of each
     *                                           character is used and a single string is
     *                                           returned. When true, every combination of
     *                                           readings is generated and a list of strings
     *                                           is returned.
     * @return string|array A string in normal mode, a list of strings in permutation
     *                      mode. An empty input yields '' in both modes.
     */
    public static function pinyin($string, $permutationAndCombination = false)
    {
        if (is_null(self::$_PinyinTable)) {
            self::$_PinyinTable = include ZEED_PATH_3rd . '/PinyinTable.inc';
        }

        // Tokenise the input: each token is either a list of pinyin readings
        // (array) or a raw byte value (int) that is passed through untouched.
        $flow = array();
        $length = strlen($string);
        for ($i = 0; $i < $length; $i ++) {
            $h = ord($string[$i]);

            // Plain single byte, or a lead byte with nothing after it.
            if ($h < 0x81 || $h > 0xfe || ! isset($string[$i + 1])) {
                $flow[] = $h;
                continue;
            }

            // Consume the trail byte of the double-byte character.
            $i ++;
            $l = ord($string[$i]);
            if (isset(self::$_PinyinTable[$h][$l])) {
                $flow[] = self::$_PinyinTable[$h][$l];
            } else {
                // Unknown pair: keep both bytes as they are.
                $flow[] = $h;
                $flow[] = $l;
            }
        }

        if (count($flow) == 0) {
            return '';
        }

        if (! $permutationAndCombination) {
            $pinyin = '';
            foreach ($flow as $val) {
                $pinyin .= is_array($val) ? ucfirst($val[0]) : chr($val);
            }

            return $pinyin;
        }

        // Turn every token into a list of candidate strings, then expand all
        // combinations and join each one into a single string.
        foreach ($flow as $key => $val) {
            $flow[$key] = is_array($val) ? array_map('ucfirst', $val) : array(chr($val));
        }

        $pinyin = array();
        foreach (self::permutationAndCombination($flow) as $p) {
            $pinyin[] = implode('', $p);
        }

        return $pinyin;
    }

    /**
     * Converts the given string to GBK.
     *
     * Encoding   Code page  Summary
     * GB2312     CP20936    6763 characters (Simplified Chinese)
     * GBK        CP936      21003 characters (Simplified, Traditional, Japanese, Korean; superset of GB2312)
     * GB18030    CP54936    27533 characters (adds minority scripts; superset of GBK, not compatible with BIG5)
     * BIG5       CP950      13053 characters (Traditional Chinese)
     * BIG-5
     * "Chinese characters in Unicode, GB2312, GBK and GB18030": http://www.fmddlmyy.cn/text24.html
     *
     * The source encoding is detected with mb_detect_encoding() among UTF-8,
     * GBK and BIG5. Strings whose encoding cannot be detected, or that are
     * already reported as CP936 / CP54936, are returned unchanged.
     *
     * @param string $string
     * @return string The converted string; the value is whatever iconv() returns,
     *                so a failed conversion yields false.
     */
    public static function convert2GBK($string)
    {
        $cs = mb_detect_encoding($string, array(
            'UTF-8',
            'GBK',
            'BIG5'));
        if (NULL == $cs) {
            // Detection failed: leave the input alone.
            return $string;
        }

        // Both names must differ before converting. The previous "||" test was
        // always true, so already-GBK input was needlessly pushed through iconv().
        if ($cs != 'CP936' && $cs != 'CP54936') {
            $string = iconv($cs, 'GBK', $string);
        }

        return $string;
    }

    /**
     * Detects the Chinese character set of a string (UTF-8/GBK/BIG5).
     *
     * http://php.net/manual/en/mbstring.supported-encodings.php
     * http://www.gnu.org/software/libiconv/
     * http://www.unicode.org/
     * http://www.fmddlmyy.cn/text24.html
     *
     * When the mbstring extension is available, mb_detect_encoding() is used
     * and its code-page names are normalised to 'GBK' / 'BIG5' / 'UTF-8'.
     * Otherwise the byte-range heuristic in _isUGB() is used.
     *
     * @param string $string
     * @return string|null 'UTF-8', 'GBK' or 'BIG5'; any other name reported by
     *                     mb_detect_encoding() is returned as is; null when the
     *                     encoding cannot be determined.
     */
    public static function isUGB($string)
    {
        if (! function_exists('mb_detect_encoding')) {
            return self::_isUGB($string);
        }

        $cs = mb_detect_encoding($string, array(
            'UTF-8',
            'BIG5',
            'GBK'));
        if (NULL == $cs) {
            return NULL;
        }

        switch ($cs) {
            case 'CP20936' :
            case 'CP936' :
            case 'CP54936' :
                return 'GBK';
            case 'CP950' :
            case 'BIG-5' :
                return 'BIG5';
            case 'UTF-8' :
                return 'UTF-8';
            default :
                return $cs;
        }
    }

    /**
     * Byte-range heuristic for telling UTF-8, GBK and BIG5 apart.
     *
     * Encoding   First byte            Second byte
     * GB2312     0xB0-0xF7 (176-247)   0xA0-0xFE (160-254)
     * GBK        0x81-0xFE (129-254)   0x40-0xFE (64-254)
     * BIG5       0x81-0xFE (129-254)   0x40-0x7E (64-126), 0xA1-0xFE (161-254)
     *
     * GBK and BIG5 are usually told apart like this:
     * 1. both bytes of a GBK code lie between 0xA0 and 0xFE;
     * 2. the first byte of a BIG5 code is 0x80-0xFF, the second 0x00-0xFF.
     *
     * NOTE(review): any byte above 0xE0 is reported as UTF-8 straight away,
     * although GBK/BIG5 lead bytes may also be above 0xE0. The heuristic is
     * kept as it was; confirm before relying on it for such input.
     *
     * @param string $string
     * @return string|null 'UTF-8', 'GBK', 'BIG5', or null when no high byte
     *                     allowed a decision.
     */
    public static function _isUGB($string)
    {
        $UGB = null;
        $length = strlen($string);
        for ($i = 0; $i < $length; $i ++) {
            $ch1 = ord($string[$i]);

            if ($ch1 > 0xE0) {
                // Treated as the lead byte of a UTF-8 sequence.
                return "UTF-8";
            }
            if ($ch1 < 0x81) {
                // ASCII (or 0x80): carries no information.
                continue;
            }

            // $ch1 is in 0x81-0xE0 here: lead byte of a double-byte character.
            // A truncated string has no trail byte; use 0 instead of reading
            // past the end (which used to raise "Uninitialized string offset").
            $ch2 = isset($string[$i + 1]) ? ord($string[$i + 1]) : 0;

            // Trail byte inside the BIG5 ranges. The BIG5 ranges are a subset
            // of the GBK trail range (0x40-0xFE), so a BIG5 match is always a
            // GBK match as well.
            $BIG5found = ($ch2 >= 0x40 && $ch2 <= 0x7E) || ($ch2 >= 0xA1 && $ch2 <= 0xFE);

            if (! $BIG5found) {
                // Not a possible BIG5 character: report GBK.
                return 'GBK';
            }

            if ($ch1 > 0xA0 && $ch1 < 0xFE && $ch2 > 0xA0 && $ch2 < 0xFE) {
                // Both bytes between 0xA0 and 0xFE: typical for GBK Chinese characters.
                $UGB = 'GBK';
            }

            if ($ch2 < 0x7F) {
                // A trail byte below 0x7F is generally BIG5.
                return 'BIG5';
            }

            // Ambiguous: skip the trail byte and inspect the next character.
            $i ++;
        }

        return $UGB;
    }

    /**
     * Obsolete detection routine based on the first high byte only.
     *
     * @deprecated Not called anywhere in this class; use isUGB() instead.
     * @param string $strtext
     * @return string|null 'UTF-8', 'BIG5', 'GBK' or null
     */
    private static function _isUGB_deprecated($strtext)
    {
        $length = strlen($strtext);
        for ($i = 0; $i < $length; $i ++) {
            $byte = ord($strtext[$i]);

            // Check UTF-8 first: it uses three bytes where GBK and BIG5 use
            // two, so the lead byte is enough to separate them.
            if ($byte > 0xE0) {
                return "UTF-8";
            }
            if ($byte > 0xA1) {
                return "BIG5";
            }
            if ($byte > 0x80) {
                return "GBK";
            }
        }

        return NULL;
    }

    /**
     * Checks whether a string consists solely of well-formed UTF-8 sequences
     * (ASCII is limited to tab, LF, CR and 0x20-0x7E).
     *
     * From http://w3.org/International/questions/qa-forms-utf-8.html
     *
     * @param string $string
     * @return int|false Result of preg_match(): 1 when the string matches,
     *                   0 when it does not, false on a PCRE error.
     */
    public static function isUTF8($string)
    {
        return preg_match('%^(?:
              [\x09\x0A\x0D\x20-\x7E]            # ASCII
            | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
            | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
            | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
            | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
            | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
            | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
            | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
        )*$%xs', $string);
    }

    /**
     * Builds every combination that takes exactly one element from each inner
     * list, i.e. C(a,1) * C(b,1) * ... * C(n,1) combinations.
     *
     * Real-world analogy: a grade has several classes of different sizes and
     * one student is picked from every class to form a team; this returns all
     * possible teams.
     *
     * Input is a two-dimensional array: the first-level key is the class
     * number, the second level lists the students of that class, e.g.
     *
     *   array(
     *       1 => array("Student10", "Student11"),
     *       2 => array("Student20", "Student21", "Student22"),
     *       3 => array("Student30"),
     *       4 => array("Student40", "Student41", "Student42", "Student43"))
     *
     * Result rows are numbered from 1 and each row is keyed by the class
     * numbers of the input. Rows are ordered so that the first class changes
     * slowest and the last class changes fastest.
     *
     * @param array $CombinList Two-dimensional array
     * @return array List of combinations; array() when $CombinList is empty or
     *               when any inner list is empty (no combination is possible).
     */
    public static function permutationAndCombination($CombinList)
    {
        if (count($CombinList) == 0) {
            return array();
        }
        foreach ($CombinList as $StudentList) {
            if (count($StudentList) == 0) {
                // An empty class makes every team impossible; this case used
                // to end in a division by zero.
                return array();
            }
        }

        // Grow the partial combinations one class at a time.
        $Rows = array(
            array());
        foreach ($CombinList as $ClassNo => $StudentList) {
            $Expanded = array();
            foreach ($Rows as $Row) {
                foreach ($StudentList as $Student) {
                    $Row[$ClassNo] = $Student;
                    $Expanded[] = $Row;
                }
            }
            $Rows = $Expanded;
        }

        // Callers receive rows keyed 1..N (historical behaviour).
        return array_combine(range(1, count($Rows)), $Rows);
    }
}
|
|||
|
|
|
|||
|
|
// End ^ LF ^ UTF-8
|