字符串hash

時間 2021-02-01

標籤 ios git 算法函數優化 spa .net code htm blog 欄目 iOS 简体版

原文原文鏈接

字符串 hash

1.主要思想，將字符轉化爲一個權值

主要的處理方式有：

隨機加
隨機乘
隨機取模

最經常使用的方式是把一整個字符串當作一個 $b$ 進制（$b$ 由本身規定）的數，再把這個數（取模後）做爲該字符串的hash值。這樣匹配的時候，把要求匹配的串也轉換爲hash值，就能夠輕鬆作到$O(1)$的比較；可是因爲預處理是$O(n + m)$的，整體的複雜度仍然爲$O(n + m)$。

注意：

hash的時候是有可能出錯的，因此hash是一個不完美算法，~~可是算法競賽能寫幾分算幾分能騙幾分算幾分啊，數據水不就NB了嗎~~。一本通有說：若是模數 $mod$ 取 $10^9+7$ 或 $10^9+9$，幾乎不可能發生衝突，緣由是這兩個數是孿生質數。

主要流程

對於一個字符串 $s$，定義一個hash函數對其映射：選取兩個合適的質數 $b$、$h$（$b < h$），字符串長度爲 $len$

\[hash(s) = \left( \sum\limits_{i = 1}^{len} s_i \times b^{len - i} \right) \bmod h \]

設$hash(s,k)$爲前$k$個字符構成的hash值，遞推獲得

\[hash(s,k+1) = \left( hash(s,k) \times b + s_{k+1} \right) \bmod h \]

若是匹配子串是否相等能夠運用前綴和的思想,定義$c$爲位置從$k+1$開始，長度爲$n$的子串,子串的hash值能夠獲得爲

\[hash(c) = \left( hash(s,k+n) - hash(s,k) \times b^{n} \right) \bmod h \]

（減法的結果可能爲負，實現時要先加上 $h$ 再取模。）

爲了優化複雜度，咱們要對 $b$ 的各次冪 $b^i$ 進行預處理（僅僅是爲了應對多組數據）
上面的是一種較爲優秀的hash函數構造方式，先乘再取模

code

// Precompute the powers of the base: pow[i] = b^i for i = 0 .. len.
// NOTE(review): no modulus is applied in this fragment — presumably pow[] is an
// unsigned 64-bit array that relies on wrap-around; otherwise each step has to
// be reduced by the hash modulus. Confirm against the declaration of pow[].
pow[0] = 1;
for(int i = 1 ; i <= len ;i++) 
    pow[i]=  pow[i - 1] * b;

由於hash算法會出鍋，因此若是容許的話能夠寫雙hash，通常雙hash就卡不掉了（LB說的)，可是會慢
例題
code

#include <iostream>
#include <cstdio>
#include <cmath>
#include <cstring>
#include <string>
#define ll long long  
#define ull unsigned long long  

using namespace std;
// Capacity of the two input buffers and of the prefix-hash array.
const int N = 1e6+100;
// Base of the polynomial hash.
const int b = 1e9+7;
// Modulus every hash value and every power of b is reduced by.
const int mod = 1e9+9;
// Reads one decimal integer from stdin.
// Every byte before the first digit is skipped; if a '-' is among the skipped
// bytes the result is negated. The byte that follows the last digit is consumed.
// Returns 0 when the input ends before any digit is seen.
int read() {
    int value = 0;
    bool negative = false;
    // Kept as int: getchar() reports end of input with EOF, which a char cannot
    // represent reliably, and the skip loop below would otherwise never stop.
    int ch = getchar();
    while (ch != EOF && (ch < '0' || ch > '9')) {
        negative |= (ch == '-');
        ch = getchar();
    }
    while (ch >= '0' && ch <= '9') {
        value = value * 10 + (ch - '0');
        ch = getchar();
    }
    return negative ? -value : value;
}

// Text (s1) and pattern (s2); main stores both starting at index 1.
char s1[N],s2[N];
// sum[i] is the hash of the first i characters of s1; sum[0] is 0.
ull sum[N];

// Binary exponentiation: returns x raised to the power y, reduced by the
// file-level constant `mod`. Each bit of y is consumed from the lowest upwards.
ll q_pow(ll x, ll y) {
    ll result = 1;
    for (; y; y >>= 1) {
        if (y & 1) {
            result = (result * x) % mod;
        }
        x = (x * x) % mod;
    }
    return result % mod;
}
int main() {
    scanf("%s%s",s1+1,s2+1);
    int len1 = strlen(s1 + 1), len2 =  strlen(s2+ 1);
    ll pow = q_pow(b,len2);
    sum[0] = 0;
    for(int i = 1 ; i <= len1 ;i++) 
        sum[i] = ((sum[i - 1] * b) % mod + s1[i]) % mod;
    ull  val = 0 ;
    for(int i = 1 ; i <= len2 ;i++) 
        val = ((val * b)% mod + s2[i] ) % mod ;
    int ans = 0 ;
    for(int i = 0 ; i <= len1 - len2 ;i++) {
        if(val == (sum[i + len2] - (sum[i] * pow) % mod + mod) % mod ) ans++;
    }
    printf("%d",ans);
    system("pause");
    return 0;
}

hash表

一開始我也不知道這是個啥神仙東西，並且一本通寫的是真的醜，理解了半天才看明白雙hash;

雙hash其實也不是難只要單hash明白了，圖論學的還OK那就徹底沒問題
首先我喜歡結構體存圖，因此我hash表是個結構體

// head[x] is the index of the node most recently inserted into bucket x
// (0 means the bucket is empty); num_h is the number of nodes handed out so far.
int head[N], num_h;
// One node of a bucket's singly linked list.
struct Hash {
    int diff;  // second hash value, tells apart strings that share a bucket
    int net;   // index of the next node in the same bucket, 0 ends the list
} hashs[N];

向hash表中加入一個第一個hash值爲 $x$、第二個hash值爲 $y$ 的字符串

// Pushes a new node carrying second-hash value y onto the front of bucket x.
void add_h(int x ,int y) {
    const int node = ++num_h;      // take the next free node
    hashs[node].diff = y;
    hashs[node].net = head[x];     // old front becomes the successor
    head[x] = node;
}

匹配問題

// Returns true when bucket x holds a node whose second hash value equals y.
bool query(int x,int y) {
    // Walk the bucket's chain through the hash-table nodes; index 0 ends it.
    for(int i = head[x] ; i ; i = hashs[i].net) {
        if(hashs[i].diff == y) return true;
    }
    return false ;
}

例題

code

#include <iostream>
#include <cstdio>
#include <cmath>
#include <cstring>
#include <string>
#define ll long long
using namespace std;
// Double-hash parameters: (b1, mod1) selects the bucket, (b2, mod2) produces the
// value stored inside the bucket. Both moduli are prime (1000003 and 1000033);
// a composite modulus spreads the hash values less evenly.
const int mod1 = 1e6+3;
const int b1 = 47;
const int mod2 = 1e6+33;
const int b2 = 79;
// Capacity of the bucket-head array and of the node pool; must exceed mod1.
const int N = 3e6+100;

// Reads one decimal integer from stdin.
// Every byte before the first digit is skipped; if a '-' is among the skipped
// bytes the result is negated. The byte that follows the last digit is consumed.
// Returns 0 when the input ends before any digit is seen.
int read() {
    int value = 0;
    bool negative = false;
    // Kept as int: getchar() reports end of input with EOF, which a char cannot
    // represent reliably, and the skip loop below would otherwise never stop.
    int ch = getchar();
    while (ch != EOF && (ch < '0' || ch > '9')) {
        negative |= (ch == '-');
        ch = getchar();
    }
    while (ch >= '0' && ch <= '9') {
        value = value * 10 + (ch - '0');
        ch = getchar();
    }
    return negative ? -value : value;
}
// Input buffer: main reads the command word into it, then overwrites it with
// the rest of the line (the title that gets hashed).
char opt[300];
// head[x] is the index of the first node in bucket x (0 = empty bucket);
// num_h counts the nodes allocated so far.
int head[N],num_h;
// Node of a bucket's linked list: diff is the stored second hash value,
// net is the index of the next node in the same bucket (0 ends the list).
struct HASHH{
    int diff,net;
}hashs[N];
// Records second-hash value y in bucket x by linking a fresh node in front of
// the bucket's current chain.
void add(int x, int y){
    const int node = ++num_h;
    hashs[node].diff = y;
    hashs[node].net = head[x];
    head[x] = node;
}
// Reports whether bucket x contains a node whose stored value equals y.
bool query(int x, int y) {
    int node = head[x];
    // Advance until the chain ends (index 0) or a matching node is reached.
    while (node != 0 && hashs[node].diff != y) {
        node = hashs[node].net;
    }
    return node != 0;
}
 // Processes n commands from stdin. Each line holds a command word followed by
 // a title (the rest of the line). A word starting with 'a' stores the title's
 // hash pair; any other word looks the pair up and prints "yes" or "no".
 int main() {
    int n = read();
    for(int q = 1 ; q <= n ; q++) {
        cin.width(sizeof(opt));          // bound the word extraction to the buffer
        if(!(cin >> opt)) break;         // stop cleanly when the input is truncated
        const bool is_add = (opt[0] == 'a');

        // The remainder of the line (leading blank included) is the title.
        // getline is bounded by the buffer size; gets() was unbounded and no
        // longer exists since C++14. A line that does not fit sets failbit,
        // which ends the loop at the next extraction.
        cin.getline(opt, sizeof(opt));

        int sum1 = 0, sum2 = 0;
        for(int i = 0 ; opt[i] != '\0' ; i++) {
            // Hash bytes as unsigned so non-ASCII text cannot drive sum1
            // negative and index head[] out of bounds.
            const int c = static_cast<unsigned char>(opt[i]);
            sum1 = (sum1 * b1 + c) % mod1;
            sum2 = (sum2 * b2 + c) % mod2;
        }

        if(is_add) add(sum1,sum2);
        else if(query(sum1,sum2)) cout<<"yes\n";
        else cout <<"no\n";
    }
    return 0;
}

hash拼接題
這個題最重要的一點就是如何拼接兩個不連續的子串的hash值
根據原式子，
設原來的字符串爲$ABC$，刪去B
原$hash$值爲$$(A \times b^2 + B \times b^1 + C \times b^0) \bmod mod$$
去掉$B$後爲$$(A \times b^1 + C \times b^0) \bmod mod$$
稍微一對比就會發現，能夠獲得一個公式

令前面的字符串爲 $M~~~hash$值已知
令後面拼接的字符串爲$N~~~hash$值已知,長度爲$len$
那麼$MN$的$hash$值能夠獲得爲

\[\left( hash[M] \times Pow[len] + hash[N] \right) \bmod mod \]

那麼這個題就很輕鬆就能夠解決了，可是公式寫的確實噁心

作法：

直接對字符串進行暴力hash，複雜度爲$O(N)$。以$n/2+1$爲分界點，對左右兩邊分別進行處理。關於$AAAAAAA$這種毒瘤數據，須要把$mid$兩邊獲得的答案進行比較。對於這個比較有兩種很好的方式，我的推薦第二種，可是可能會被卡；第一種也可能會被卡（可能性與第二種差很少）。第二個方法更暴力（STL），可是好想也好寫。

方法一：將左右兩邊的答案加到$string$類型的一個變量裏根據$==$直接判斷

方法二：將左右兩邊的答案在字符串中的hash值求出，進行判斷（隨緣看數據，能不能過）

寫hash是真的被卡噁心了

這個題中有一個很是優秀的性質：
若是發如今前一半中刪去其中一個數等於後一半的hash值那麼，在前一半中若是還有可能刪去另外一個等於後一半那麼這兩個串必定是相等的，一樣能夠推斷後半部分也有這個性質，進一步推動，若是將這個$for$分紅兩部分來寫，對於隨機數據能夠大大優化時間複雜度，甚至到不了$O(N)$可是若是卡的話仍是會到$O(N)$帶常數的一個級別，若是用方法二作，能夠近似爲常數不超過2，能夠說很是優秀

方法一：

#include <iostream>
#include <cstdio>
#include <cmath>
#include <map>
#define ll long long
using namespace std;
// Capacity of the character buffer, the prefix-hash array and the power table.
const int N = 2e6+100;
// Base of the polynomial hash.
const int b = 1e9+7;
// Modulus every hash value and every power of b is reduced by.
const int mod = 1e9 + 9;
// Reads one decimal integer from stdin.
// Every byte before the first digit is skipped; if a '-' is among the skipped
// bytes the result is negated. The byte that follows the last digit is consumed.
// Returns 0 when the input ends before any digit is seen.
int read() {
	int value = 0;
	bool negative = false;
	// Kept as int: getchar() reports end of input with EOF, which a char cannot
	// represent reliably, and the skip loop below would otherwise never stop.
	int ch = getchar();
	while (ch != EOF && (ch < '0' || ch > '9')) {
		negative |= (ch == '-');
		ch = getchar();
	}
	while (ch >= '0' && ch <= '9') {
		value = value * 10 + (ch - '0');
		ch = getchar();
	}
	return negative ? -value : value;
}
// Input string, filled by main starting at index 1.
char u[N];
// hashs[i] is the hash of u[1..i]; Pow[k] is b^k mod `mod`, filled by prepare().
ll hashs[N],Pow[N];
// ans1 / ans2: candidate answers assembled by main from the left and the right
// part of the input. temp is not referenced by the code shown here.
string ans1 ,temp,ans2;
// Length of the input string, read in main.
int n ;
// Fills the power table: Pow[k] = b^k mod `mod` for k = 0 .. 2,000,000.
void prepare() {
	const int limit = 2000000;
	Pow[0] = 1;
	int k = 0;
	while (k < limit) {
		Pow[k + 1] = Pow[k] * b % mod;
		++k;
	}
}
// Debug helper: writes the prefix hashes hashs[1..n] to stdout, each one
// followed by a single space.
void print() {
	int idx = 1;
	while (idx <= n) {
		cout << hashs[idx] << " ";
		++idx;
	}
}
int main() {
	prepare();
	ans1 += '0';
	n = read();
	for(int i = 1 ; i <= n ; i++) {
		cin >> u[i];
		hashs[i] = ( hashs[i - 1] * b % mod + u[i] ) % mod;
	}
	int flag = 0 ,flag2 = 0,pos1 = 0, pos2 = 0;
	if( (n & 1) == 0) {
		cout <<"NOT POSSIBLE\n";
		return 0;
	} else {
		for(int i = 1 ; i <= n ; i++) {
			if(i <= n / 2 + 1) {
				if( flag == 0&&(( (hashs[n / 2 + 1] - hashs[i] * Pow[n / 2 + 1 - i] % mod + mod) % mod  + hashs[i - 1] * Pow[n / 2 + 1 - i] % mod+ mod ) % mod
				                == ( hashs[n] - hashs[n / 2 + 1] * Pow[n / 2] % mod + mod ) % mod) ) {
					pos1 = i,flag++;
					int cnt = 0;
					ans1.clear();
					for(int j = 1 ; cnt != n/2 ; j++) {
						if(j == pos1) continue;
						ans1 += u[j];
						cnt++;
					}
				}
			} else {
				if( flag2 == 0&&(( ( hashs[n] - hashs[i] * Pow[n - i] % mod + mod) % mod + (hashs[i - 1] - hashs[ n / 2 ] * Pow[i - 1 - n / 2 ] % mod + mod ) % mod * Pow[n - i] + mod ) % mod
				                 == ( hashs[n / 2] + mod ) % mod) ) {
					pos2 = i ,flag2++;
					int cnt = 0;
					for(int j = n/2+1 ; cnt != n/2 ; j++) {
						if(j == pos2) continue;
						ans2 += u[j];
						cnt++;
					}
				}
			}
		}
		if(flag + flag2 == 0&&(ans1 != ans2 || ans1 =="0" )) {
			cout <<"NOT POSSIBLE\n";
			return 0;
		}
		if(ans1 == ans2) {
			cout << ans1;

			return 0;
		}
		if(flag + flag2 > 1) {
			cout <<"NOT UNIQUE";
		} else {
			if(flag == 1) cout << ans1;
			else {
				cout << ans2;
			}
		}
		system("pause");
		return 0;
	}
}
/*
7
ABXCABC
*/

方法二：

#include <iostream>
#include <cstdio>
#include <cmath>
#include <map>
#define ll long long
using namespace std;
// Capacity of the character buffer, the prefix-hash array and the power table.
const int N = 2e6+100;
// Base of the polynomial hash.
const int b = 1e9+3;
// Modulus every hash value and every power of b is reduced by.
const int mod = 1e9 + 7;
// Reads one decimal integer from stdin.
// Every byte before the first digit is skipped; if a '-' is among the skipped
// bytes the result is negated. The byte that follows the last digit is consumed.
// Returns 0 when the input ends before any digit is seen.
int read() {
	int value = 0;
	bool negative = false;
	// Kept as int: getchar() reports end of input with EOF, which a char cannot
	// represent reliably, and the skip loop below would otherwise never stop.
	int ch = getchar();
	while (ch != EOF && (ch < '0' || ch > '9')) {
		negative |= (ch == '-');
		ch = getchar();
	}
	while (ch >= '0' && ch <= '9') {
		value = value * 10 + (ch - '0');
		ch = getchar();
	}
	return negative ? -value : value;
}
// Input string, filled by main starting at index 1.
char u[N];
// hashs[i] is the hash of u[1..i]; Pow[k] is b^k mod `mod`, filled by prepare().
ll hashs[N],Pow[N];
// Hash values of the candidate answers found by main in the left (ans1) and
// right (ans2) part of the input; both stay 0 until a candidate is found.
ll ans1, ans2;
// Length of the input string, read in main.
int n ;
// Fills the power table: Pow[k] = b^k mod `mod` for k = 0 .. 2,000,000.
void prepare() {
	const int limit = 2000000;
	Pow[0] = 1;
	int k = 0;
	while (k < limit) {
		Pow[k + 1] = Pow[k] * b % mod;
		++k;
	}
}
// Debug helper: writes the prefix hashes hashs[1..n] to stdout, each one
// followed by a single space.
void print() {
	int idx = 1;
	while (idx <= n) {
		cout << hashs[idx] << " ";
		++idx;
	}
}
int main() {
	prepare();
	n = read();
	for(int i = 1 ; i <= n ; i++) {
		cin >> u[i];
		hashs[i] = ( hashs[i - 1] * b % mod + u[i] ) % mod;
	}
	int flag = 0 ,flag2 = 0,pos1 = 0, pos2 = 0;
	if( (n & 1) == 0) {
		cout <<"NOT POSSIBLE\n";
		return 0;
	} else {
		for(int i = 1 ; i <= n ; i++) {
			if(i <= n / 2 + 1) {
				if( flag == 0&&( (hashs[n / 2 + 1] - hashs[i] * Pow[n / 2 + 1 - i] % mod + mod) % mod  + hashs[i - 1] * Pow[n / 2 + 1 - i] % mod+ mod ) % mod
				                == ( hashs[n] - hashs[n / 2 + 1] * Pow[n / 2] % mod + mod ) % mod)  {
					flag++;
					pos1 = i;
					ans1 = ((hashs[n / 2 + 1] - hashs[i] * Pow[n / 2 + 1 - i] % mod + mod) % mod  + hashs[i - 1] * Pow[n / 2 + 1 - i] % mod+ mod ) % mod;
				}
			} else {
				if( flag2 == 0&&( ( ( hashs[n] - hashs[i] * Pow[n - i] % mod + mod) % mod + (hashs[i - 1] - hashs[ n / 2 ] * Pow[i - 1 - n / 2 ] % mod + mod ) % mod * Pow[n - i] + mod ) % mod
				                 == ( hashs[n / 2] + mod ) % mod) ) {
					flag2++;
					pos2 = i;
					ans2 = ( ( hashs[n] - hashs[i] * Pow[n - i] % mod + mod) % mod + (hashs[i - 1] - hashs[ n / 2 ] * Pow[i - 1 - n / 2 ] % mod + mod ) % mod * Pow[n - i] + mod ) % mod;
				}
			}
		}
	}
	if((flag + flag2 == 0)&& (ans1 != ans2 || ans1 + ans2 == 0)) {
		cout <<"NOT POSSIBLE\n";
		return 0;
	}
//	cout << ans1 <<" " << ans1;
	if(flag + flag2 > 1 && ans1 != ans2) {
		cout <<"NOT UNIQUE";
		return 0;
	} else {
		if(flag == 1) {
			int cnt = 0;
			for(int i = 1  ; cnt != n/2 ; i++ ) {
				if(i == pos1) continue;
				cout<< u[i];
				cnt++;
			}
			return 0;
		} else {
			int cnt = 0;
			for(int i = n/2+1  ; cnt != n/2 ; i++ ) {
				if(i == pos2) continue;
				cout<< u[i];
				cnt++;
			}
			return 0;
		}
	}
	if(ans1 == ans2 && ans1 + ans2  != 0) {
		int cnt = 0;
		for(int i = 1  ; cnt != n/2 ; i++ ) {
			if(i == pos1) continue;
			cout<< u[i];
			cnt++;
		}
		return 0;
	}
	system("pause");
	return 0;
}