PHP Big5 Utf-8 GB2312 相互轉碼解決辦法

時間 2020-01-08

標籤 php big5 big utf gb2312 相互解決辦法欄目 PHP 简体版

原文原文鏈接

編寫 PHP 代碼的過程中，經常會遇到需要對中文轉碼的問題，如 GB2312 <=> Unicode、GB2312 <=> Big5 等等。如果 PHP 編譯時帶有 mbstring 的話，可以使用 Multi-Byte String Function 實現部分轉碼工作。然而因為很多虛擬主機不支持 mbstring，或者 mbstring 的編譯、配置過於麻煩，很多 PHP 代碼無法使用這一系列的函數。

最近為了解決這個問題，找到一個不錯的項目：PHP News Reader，這是一個基於 WEB 的新聞閱讀器，支持基於 NNTP (RFC 977) 協議的新聞文章的閱讀、發佈、刪除、回覆等功能。這個項目實現了 GB2312、Big5、Unicode (UTF-8) 之間的相互轉碼，這個正是我所關心的部分。

使用 CVS 客戶端（Linux 下直接用命令行就行，Windows 下推薦使用 Tortoise CVS）將項目的代碼 Check Out 出來：

# cvs -d:pserver:anonymous@cvs.sourceforge.net:/cvsroot/pnews login
Logging in to :pserver:anonymous@cvs.sourceforge.net:2401/cvsroot/pnews
CVS password: (Press Enter)
# cvs -z3 -d:pserver:anonymous@cvs.sourceforge.net:/cvsroot/pnews co pnews
cvs server: Updating pnews
…

查看 pnews/language 目錄，此目錄下包含了如下文件：

big5-gb.tab
big5-unicode.tab
gb-big5.tab
gb-unicode.tab
unicode-big5.tab
unicode-gb.tab

這些都是用於字符轉換的碼表，然後再看看 pnews/language.inc.php 文件，其中包含了幾個用於編碼轉換的函數：

// Big5 => GB
/**
 * Convert a Big5 byte string to GB using the table language/big5-gb.tab.
 *
 * Every byte >= 160 is treated as the lead byte of a two-byte character and
 * is replaced, together with the byte that follows it, by the two bytes read
 * from the table. The table is addressed as 3-byte records (only the first
 * two bytes of a record are used), 255 records per lead byte.
 *
 * Substitutions:
 *  - Big5 A1 40 becomes two ASCII spaces (no table lookup);
 *  - a pair whose table record cannot be read (negative offset, offset past
 *    the end of the table) becomes two ASCII spaces;
 *  - a lead byte that is the very last byte of the input becomes '?'.
 *
 * @param string $instr Big5 encoded bytes.
 * @return string|false Converted bytes (same length as the input), or false
 *                      when the table file cannot be opened.
 */
function b2g( $instr ) {
	$instr = (string)$instr;
	$fp = fopen( 'language/big5-gb.tab', 'r' );
	if( $fp === false )
		return false;
	$len = strlen($instr);
	for( $i = 0 ; $i < $len ; $i++ ) {
		$h = ord($instr[$i]);
		if( $h < 160 )
			continue;
		if( $i + 1 >= $len ) {
			// Truncated character: there is no trail byte to read or overwrite.
			$instr[$i] = '?';
			break;
		}
		$l = ord($instr[$i+1]);
		$gb = '  ';
		if( !($h == 161 && $l == 64) ) {
			$pos = (($h-160)*255+$l-1)*3;
			if( $pos >= 0 && fseek( $fp, $pos ) === 0 ) {
				$rec = fread( $fp, 2 );
				// Only accept a complete record; a short read would make the
				// string-offset assignments below fail.
				if( is_string($rec) && strlen($rec) == 2 )
					$gb = $rec;
			}
		}
		$instr[$i] = $gb[0];
		$instr[$i+1] = $gb[1];
		$i++;
	}
	fclose($fp);
	return $instr;
}
// GB => BIG5
/**
 * Convert a GB byte string to Big5 using the table language/gb-big5.tab.
 *
 * A byte in the range 161..247 is treated as a lead byte. When the following
 * byte is in 161..254 the pair is replaced by the two bytes read from the
 * table, which is addressed as 3-byte records (only the first two bytes of a
 * record are used), 94 records per lead byte.
 *
 * Substitutions:
 *  - a pair with an out-of-range trail byte becomes two ASCII spaces;
 *  - a pair whose table record cannot be read completely becomes two ASCII
 *    spaces;
 *  - a lead byte that is the very last byte of the input becomes '?'.
 *
 * @param string $instr GB encoded bytes.
 * @return string|false Converted bytes (same length as the input), or false
 *                      when the table file cannot be opened.
 */
function g2b( $instr ) {
	$instr = (string)$instr;
	$fp = fopen( 'language/gb-big5.tab', 'r' );
	if( $fp === false )
		return false;
	$len = strlen($instr);
	for( $i = 0 ; $i < $len ; $i++ ) {
		$h = ord($instr[$i]);
		if( $h <= 160 || $h >= 248 )
			continue;
		if( $i + 1 >= $len ) {
			// Truncated character: there is no trail byte to read or overwrite.
			$instr[$i] = '?';
			break;
		}
		$l = ord($instr[$i+1]);
		$bg = '  ';
		if( $l > 160 && $l < 255 ) {
			if( fseek( $fp, (($h-161)*94+$l-161)*3 ) === 0 ) {
				$rec = fread( $fp, 2 );
				// Only accept a complete record; a short read would make the
				// string-offset assignments below fail.
				if( is_string($rec) && strlen($rec) == 2 )
					$bg = $rec;
			}
		}
		$instr[$i] = $bg[0];
		$instr[$i+1] = $bg[1];
		$i++;
	}
	fclose($fp);
	return $instr;
}
// Big5 => Unicode(UTF-8)
/**
 * Convert a Big5 byte string to UTF-8 using the table
 * language/big5-unicode.tab.
 *
 * Bytes below 160 are copied unchanged. Every byte >= 160 is treated as the
 * lead byte of a two-byte character; the table holds one 2-byte big-endian
 * code point per character, 255 entries (510 bytes) per lead byte. The code
 * point is then encoded as a 1-, 2- or 3-byte UTF-8 sequence.
 *
 * Special cases:
 *  - Big5 A1 40 (full-width space) is mapped to U+3000 without a lookup.
 *    The previous code pushed two ASCII spaces through the code-point path,
 *    which produced U+2020 (dagger);
 *  - a character whose table entry cannot be read completely becomes '?';
 *  - a lead byte that is the very last byte of the input becomes '?'.
 *
 * The result is built by concatenation: writing to offsets of an empty
 * string and join()-ing it only worked while PHP converted the empty string
 * to an array, which it no longer does since PHP 7.1.
 *
 * @param string $instr Big5 encoded bytes.
 * @return string|false|null UTF-8 bytes; false when the table file cannot be
 *                           opened; NULL for empty input (kept for existing
 *                           callers).
 */
function b2u( $instr ) {
	$instr = (string)$instr;
	$fp = fopen( 'language/big5-unicode.tab', 'r' );
	if( $fp === false )
		return false;
	$len = strlen($instr);
	$outstr = '';
	for( $i = 0 ; $i < $len ; $i++ ) {
		$h = ord($instr[$i]);
		if( $h < 160 ) {
			$outstr .= $instr[$i];
			continue;
		}
		if( $i + 1 >= $len ) {
			// Truncated character at the end of the input.
			$outstr .= '?';
			break;
		}
		$l = ord($instr[$i+1]);
		$i++;
		$codenum = -1;	// -1 = no usable table entry
		if( $h == 161 && $l == 64 )
			$codenum = 0x3000;
		else {
			$pos = ($h-160)*510+($l-1)*2;
			if( $pos >= 0 && fseek( $fp, $pos ) === 0 ) {
				$uni = fread( $fp, 2 );
				if( is_string($uni) && strlen($uni) == 2 )
					$codenum = ord($uni[0])*256 + ord($uni[1]);
			}
		}
		// Integer shifts/masks keep chr() away from float division results.
		if( $codenum < 0 )
			$outstr .= '?';
		elseif( $codenum < 0x80 )
			$outstr .= chr( $codenum );
		elseif( $codenum < 0x800 )
			$outstr .= chr( 0xC0 | ($codenum >> 6) )
				. chr( 0x80 | ($codenum & 0x3F) );
		else
			$outstr .= chr( 0xE0 | ($codenum >> 12) )
				. chr( 0x80 | (($codenum >> 6) & 0x3F) )
				. chr( 0x80 | ($codenum & 0x3F) );
	}
	fclose($fp);
	if( $instr != '' )
		return $outstr;
}
// Unicode(UTF-8) => BIG5
/**
 * Convert a UTF-8 byte string to Big5 using the table
 * language/unicode-big5.tab.
 *
 * The table is indexed by code point: the two output bytes for code point U
 * are read from file offset U * 2.
 *
 *  - bytes below 0x80 are copied unchanged;
 *  - a lead byte >= 224 starts a 3-byte sequence, a lead byte in 192..223 a
 *    2-byte sequence;
 *  - any other byte (a stray continuation byte) is dropped, as before;
 *  - a sequence cut off by the end of the input becomes '?';
 *  - a code point whose table entry cannot be read completely (for example
 *    because it lies past the end of the table) becomes '?'.
 *
 * Fixes over the previous version: the 2-byte branch took its second byte
 * from the lead byte position and printed debug output into the page; the
 * result was assembled through string offsets on an empty string plus
 * join(), which no longer works since PHP 7.1.
 *
 * @param string $instr UTF-8 encoded bytes.
 * @return string|false|null Big5 bytes; false when the table file cannot be
 *                           opened; NULL for empty input (kept for existing
 *                           callers).
 */
function u2b( $instr ) {
	$instr = (string)$instr;
	$fp = fopen( 'language/unicode-big5.tab', 'r' );
	if( $fp === false )
		return false;
	$len = strlen($instr);
	$outstr = '';
	for( $i = 0 ; $i < $len ; $i++ ) {
		$b1 = ord($instr[$i]);
		if( $b1 < 0x80 ) {
			$outstr .= $instr[$i];
			continue;
		}
		if( $b1 >= 224 )	# 3 bytes UTF-8
			$extra = 2;
		elseif( $b1 >= 192 )	# 2 bytes UTF-8
			$extra = 1;
		else
			continue;	# stray continuation byte
		if( $i + $extra >= $len ) {
			// Sequence is cut off by the end of the input.
			$outstr .= '?';
			break;
		}
		if( $extra == 2 )
			$uc = ($b1 - 224) * 4096
				+ (ord($instr[$i+1]) - 128) * 64
				+ (ord($instr[$i+2]) - 128);
		else
			$uc = ($b1 - 192) * 64 + (ord($instr[$i+1]) - 128);
		$i += $extra;
		$bg = '?';
		// $uc can be negative when the following bytes are not continuation
		// bytes; never seek to a negative offset.
		if( $uc >= 0 && fseek( $fp, $uc * 2 ) === 0 ) {
			$rec = fread( $fp, 2 );
			if( is_string($rec) && strlen($rec) == 2 )
				$bg = $rec;
		}
		$outstr .= $bg;
	}
	fclose($fp);
	if( $instr != '' )
		return $outstr;
}
// GB => Unicode(UTF-8)
/**
 * Convert a GB byte string to UTF-8 using the table
 * language/gb-unicode.tab.
 *
 * Bytes up to 160 are copied unchanged. Every byte > 160 is treated as the
 * lead byte of a two-byte character; the table holds one 2-byte big-endian
 * code point per character, 94 entries (188 bytes) per lead byte, with both
 * bytes counted from 161. The code point is then encoded as a 1-, 2- or
 * 3-byte UTF-8 sequence.
 *
 * Substitutions:
 *  - a pair whose trail byte is outside 161..254 (the same range g2b()
 *    accepts) becomes '?', because its offset would be negative or fall
 *    into another lead byte's row;
 *  - a character whose table entry cannot be read completely becomes '?';
 *  - a lead byte that is the very last byte of the input becomes '?'.
 *
 * The result is built by concatenation: writing to offsets of an empty
 * string and join()-ing it only worked while PHP converted the empty string
 * to an array, which it no longer does since PHP 7.1.
 *
 * @param string $instr GB encoded bytes.
 * @return string|false|null UTF-8 bytes; false when the table file cannot be
 *                           opened; NULL for empty input (kept for existing
 *                           callers).
 */
function g2u( $instr ) {
	$instr = (string)$instr;
	$fp = fopen( 'language/gb-unicode.tab', 'r' );
	if( $fp === false )
		return false;
	$len = strlen($instr);
	$outstr = '';
	for( $i = 0 ; $i < $len ; $i++ ) {
		$h = ord($instr[$i]);
		if( $h <= 160 ) {
			$outstr .= $instr[$i];
			continue;
		}
		if( $i + 1 >= $len ) {
			// Truncated character at the end of the input.
			$outstr .= '?';
			break;
		}
		$l = ord($instr[$i+1]);
		$i++;
		$codenum = -1;	// -1 = no usable table entry
		if( $l > 160 && $l < 255 ) {
			if( fseek( $fp, ($h-161)*188+($l-161)*2 ) === 0 ) {
				$uni = fread( $fp, 2 );
				if( is_string($uni) && strlen($uni) == 2 )
					$codenum = ord($uni[0])*256 + ord($uni[1]);
			}
		}
		// Integer shifts/masks keep chr() away from float division results.
		if( $codenum < 0 )
			$outstr .= '?';
		elseif( $codenum < 0x80 )
			$outstr .= chr( $codenum );
		elseif( $codenum < 0x800 )
			$outstr .= chr( 0xC0 | ($codenum >> 6) )
				. chr( 0x80 | ($codenum & 0x3F) );
		else
			$outstr .= chr( 0xE0 | ($codenum >> 12) )
				. chr( 0x80 | (($codenum >> 6) & 0x3F) )
				. chr( 0x80 | ($codenum & 0x3F) );
	}
	fclose($fp);
	if( $instr != '' )
		return $outstr;
}
// Unicode(UTF-8) => GB
/**
 * Convert a UTF-8 byte string to GB using the table
 * language/unicode-gb.tab.
 *
 * The table is indexed by code point: the two output bytes for code point U
 * are read from file offset U * 2.
 *
 *  - bytes below 0x80 are copied unchanged;
 *  - a lead byte >= 224 starts a 3-byte sequence, a lead byte in 192..223 a
 *    2-byte sequence;
 *  - any other byte (a stray continuation byte) is dropped, as before;
 *  - a sequence cut off by the end of the input becomes '?';
 *  - a code point whose table entry cannot be read completely (for example
 *    because it lies past the end of the table) becomes '?'.
 *
 * Fixes over the previous version: the 2-byte branch took its second byte
 * from the lead byte position and printed debug output into the page; the
 * result was assembled through string offsets on an empty string plus
 * join(), which no longer works since PHP 7.1.
 *
 * @param string $instr UTF-8 encoded bytes.
 * @return string|false|null GB bytes; false when the table file cannot be
 *                           opened; NULL for empty input (kept for existing
 *                           callers).
 */
function u2g( $instr ) {
	$instr = (string)$instr;
	$fp = fopen( 'language/unicode-gb.tab', 'r' );
	if( $fp === false )
		return false;
	$len = strlen($instr);
	$outstr = '';
	for( $i = 0 ; $i < $len ; $i++ ) {
		$b1 = ord($instr[$i]);
		if( $b1 < 0x80 ) {
			$outstr .= $instr[$i];
			continue;
		}
		if( $b1 >= 224 )	# 3 bytes UTF-8
			$extra = 2;
		elseif( $b1 >= 192 )	# 2 bytes UTF-8
			$extra = 1;
		else
			continue;	# stray continuation byte
		if( $i + $extra >= $len ) {
			// Sequence is cut off by the end of the input.
			$outstr .= '?';
			break;
		}
		if( $extra == 2 )
			$uc = ($b1 - 224) * 4096
				+ (ord($instr[$i+1]) - 128) * 64
				+ (ord($instr[$i+2]) - 128);
		else
			$uc = ($b1 - 192) * 64 + (ord($instr[$i+1]) - 128);
		$i += $extra;
		$gb = '?';
		// $uc can be negative when the following bytes are not continuation
		// bytes; never seek to a negative offset.
		if( $uc >= 0 && fseek( $fp, $uc * 2 ) === 0 ) {
			$rec = fread( $fp, 2 );
			if( is_string($rec) && strlen($rec) == 2 )
				$gb = $rec;
		}
		$outstr .= $gb;
	}
	fclose($fp);
	if( $instr != '' )
		return $outstr;
}

在自己的 PHP 文件中需要轉碼時，只需要 .tab 碼表文件及相對應的轉碼函數，將函數中 fopen 打開的文件路徑修改為正確的路徑即可。