作者|Marcellus Ruben
來源|Towards Data Science
爲了可視化詞嵌入,我們將使用常見的降維技術,如PCA和t-SNE。爲了將單詞映射到嵌入空間中的向量表示,我們使用預訓練詞嵌入GloVe。
pip install gensim
import pickle

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Convert the GloVe text file to word2vec text format so that
# KeyedVectors.load_word2vec_format can read it.
# NOTE(review): datapath() is imported from gensim.test.utils but is given an
# absolute path here -- confirm that the path points at the local GloVe file.
glove_file = datapath('C:/Users/Desktop/glove.6B.100d.txt')
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

model = KeyedVectors.load_word2vec_format(word2vec_glove_file)

# Persist the loaded vectors so later scripts can unpickle them instead of
# repeating the conversion. The context manager guarantees the file is
# flushed and closed even if pickling fails.
filename = 'glove2word2vec_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)
import pickle

filename = 'glove2word2vec_model.sav'
# NOTE: pickle can execute arbitrary code while loading; only load a model
# file that you produced yourself.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)


def append_list(sim_words, words):
    """Tag every entry of ``sim_words`` with the query word it came from.

    Args:
        sim_words: Sequence of result entries (iterables such as the
            ``(word, similarity)`` pairs returned by ``most_similar``).
        words: The query word appended as the last element of each entry.

    Returns:
        A list of tuples, each being the original entry followed by ``words``.
    """
    return [tuple(entry) + (words,) for entry in sim_words]


input_word = 'school'
user_input = [x.strip() for x in input_word.split(',')]

# Collect (similar word, similarity, query word) triples for every query word.
result_word = []
for words in user_input:
    sim_words = model.most_similar(words, topn=5)
    sim_words = append_list(sim_words, words)
    result_word.extend(sim_words)

similar_word = [word[0] for word in result_word]
similarity = [word[1] for word in result_word]
# The query words themselves are plotted too, after all of their neighbours.
similar_word.extend(user_input)
labels = [word[2] for word in result_word]
# Map every distinct query word to a 1-based integer id.
label_dict = {label: index + 1 for index, label in enumerate(set(labels))}
color_map = [label_dict[x] for x in labels]
舉個例子,假設我們想找出與「school」相關聯的5個最相似的單詞。因此,「school」將是我們的輸入詞。我們得到的結果是‘college’、‘schools’、‘elementary’、‘students’和‘student’。
import plotly
import numpy as np
import plotly.graph_objs as go
from sklearn.decomposition import PCA


def display_pca_scatterplot_3D(model, user_input=None, words=None, label=None, color_map=None, topn=5, sample=10):
    """Reduce word vectors to 3D with PCA and show them as a Plotly scatter plot.

    Args:
        model: Word-vector model; indexed as ``model[word]`` and, when
            ``words`` is None, asked for its vocabulary.
        user_input: Sequence of input words; one trace of ``topn`` points is
            drawn per entry. None is treated as "no input words".
        words: Words to plot, ordered as ``topn`` entries per input word
            followed by the input words themselves. When None, words are
            taken from the model vocabulary.
        label: Accepted but not used.
        color_map: Accepted but not used.
        topn: Number of consecutive entries of ``words`` per input word.
        sample: Number of vocabulary words drawn with ``np.random.choice``
            when ``words`` is None; a value <= 0 selects the whole vocabulary.
    """
    # `is None` instead of `== None`: comparing a numpy array with `==`
    # is element-wise and cannot be used as a condition.
    if words is None:
        # gensim >= 4 exposes the vocabulary as `key_to_index`; older
        # releases expose it as `vocab`.
        vocabulary = getattr(model, 'key_to_index', None)
        if vocabulary is None:
            vocabulary = model.vocab
        if sample > 0:
            words = np.random.choice(list(vocabulary.keys()), sample)
        else:
            words = list(vocabulary)
    if user_input is None:
        # The default used to crash on len(None); treat it as an empty list.
        user_input = []

    word_vectors = np.array([model[w] for w in words])
    three_dim = PCA(random_state=0).fit_transform(word_vectors)[:, :3]
    # For 2D, replace the three_dim variable with two_dim as follows:
    # two_dim = PCA(random_state=0).fit_transform(word_vectors)[:,:2]

    data = []
    count = 0
    for input_word in user_input:
        group = slice(count, count + topn)
        trace = go.Scatter3d(
            x=three_dim[group, 0],
            y=three_dim[group, 1],
            z=three_dim[group, 2],
            text=words[group],
            name=input_word,
            textposition="top center",
            textfont_size=20,
            mode='markers+text',
            marker={
                'size': 10,
                'opacity': 0.8,
                'color': 2
            }
        )
        # For 2D, use go.Scatter instead of go.Scatter3d and drop the z
        # argument; also use the two_dim variable declared earlier instead of
        # three_dim.
        data.append(trace)
        count = count + topn

    # Everything after the per-input groups is plotted as the input words.
    trace_input = go.Scatter3d(
        x=three_dim[count:, 0],
        y=three_dim[count:, 1],
        z=three_dim[count:, 2],
        text=words[count:],
        name='input words',
        textposition="top center",
        textfont_size=20,
        mode='markers+text',
        marker={
            'size': 10,
            'opacity': 1,
            'color': 'black'
        }
    )
    # For 2D, use go.Scatter instead of go.Scatter3d and drop the z argument;
    # also use the two_dim variable declared earlier instead of three_dim.
    data.append(trace_input)

    # Configure the layout
    layout = go.Layout(
        margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
        showlegend=True,
        legend=dict(
            x=1,
            y=0.5,
            font=dict(
                family="Courier New",
                size=25,
                color="black"
            )),
        font=dict(
            family=" Courier New ",
            size=15),
        autosize=False,
        width=1000,
        height=1000
    )

    plot_figure = go.Figure(data=data, layout=layout)
    plot_figure.show()


display_pca_scatterplot_3D(model, user_input, similar_word, labels, color_map)
import plotly
import numpy as np
import plotly.graph_objs as go
from sklearn.manifold import TSNE


def display_tsne_scatterplot_3D(model, user_input=None, words=None, label=None, color_map=None, perplexity = 0, learning_rate = 0, iteration = 0, topn=5, sample=10):
    """Reduce word vectors to 3D with t-SNE and show them as a Plotly scatter plot.

    Args:
        model: Word-vector model; indexed as ``model[word]`` and, when
            ``words`` is None, asked for its vocabulary.
        user_input: Sequence of input words; one trace of ``topn`` points is
            drawn per entry. None is treated as "no input words".
        words: Words to plot, ordered as ``topn`` entries per input word
            followed by the input words themselves. When None, words are
            taken from the model vocabulary.
        label: Accepted but not used.
        color_map: Accepted but not used.
        perplexity: Forwarded to ``TSNE(perplexity=...)``.
        learning_rate: Forwarded to ``TSNE(learning_rate=...)``.
        iteration: Forwarded to ``TSNE(n_iter=...)``.
        topn: Number of consecutive entries of ``words`` per input word.
        sample: Number of vocabulary words drawn with ``np.random.choice``
            when ``words`` is None; a value <= 0 selects the whole vocabulary.
    """
    # `is None` instead of `== None`: comparing a numpy array with `==`
    # is element-wise and cannot be used as a condition.
    if words is None:
        # gensim >= 4 exposes the vocabulary as `key_to_index`; older
        # releases expose it as `vocab`.
        vocabulary = getattr(model, 'key_to_index', None)
        if vocabulary is None:
            vocabulary = model.vocab
        if sample > 0:
            words = np.random.choice(list(vocabulary.keys()), sample)
        else:
            words = list(vocabulary)
    if user_input is None:
        # The default used to crash on len(None); treat it as an empty list.
        user_input = []

    word_vectors = np.array([model[w] for w in words])

    # NOTE(review): the zero defaults for perplexity / learning_rate /
    # iteration are passed to TSNE unchanged, so callers are expected to
    # supply real values -- confirm. Newer scikit-learn releases also rename
    # n_iter to max_iter; check against the installed version.
    three_dim = TSNE(n_components=3, random_state=0, perplexity=perplexity, learning_rate=learning_rate, n_iter=iteration).fit_transform(word_vectors)[:, :3]
    # For 2D, replace the three_dim variable with two_dim as follows:
    # two_dim = TSNE(n_components = 2, random_state=0, perplexity = perplexity, learning_rate = learning_rate, n_iter = iteration).fit_transform(word_vectors)[:,:2]

    data = []
    count = 0
    for input_word in user_input:
        group = slice(count, count + topn)
        trace = go.Scatter3d(
            x=three_dim[group, 0],
            y=three_dim[group, 1],
            z=three_dim[group, 2],
            text=words[group],
            name=input_word,
            textposition="top center",
            textfont_size=20,
            mode='markers+text',
            marker={
                'size': 10,
                'opacity': 0.8,
                'color': 2
            }
        )
        # For 2D, use go.Scatter instead of go.Scatter3d and drop the z
        # argument; also use the two_dim variable declared earlier instead of
        # three_dim.
        data.append(trace)
        count = count + topn

    # Everything after the per-input groups is plotted as the input words.
    trace_input = go.Scatter3d(
        x=three_dim[count:, 0],
        y=three_dim[count:, 1],
        z=three_dim[count:, 2],
        text=words[count:],
        name='input words',
        textposition="top center",
        textfont_size=20,
        mode='markers+text',
        marker={
            'size': 10,
            'opacity': 1,
            'color': 'black'
        }
    )
    # For 2D, use go.Scatter instead of go.Scatter3d and drop the z argument;
    # also use the two_dim variable declared earlier instead of three_dim.
    data.append(trace_input)

    # Configure the layout
    layout = go.Layout(
        margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
        showlegend=True,
        legend=dict(
            x=1,
            y=0.5,
            font=dict(
                family="Courier New",
                size=25,
                color="black"
            )),
        font=dict(
            family=" Courier New ",
            size=15),
        autosize=False,
        width=1000,
        height=1000
    )

    plot_figure = go.Figure(data=data, layout=layout)
    plot_figure.show()


display_tsne_scatterplot_3D(model, user_input, similar_word, labels, color_map, 5, 500, 10000)
由於我們使用的是Scikit-learn,因此我們可以參考文檔來找出這些參數的默認值。perplexity的默認值是30,但我們可以在5到50之間調整該值。學習率的默認值是200,但我們可以在10到1000之間調整該值。最後,迭代次數的默認值是1000,但該值最低可以調整到250。我們可以使用slider屬性來創建這些參數值。
import streamlit as st

# Sidebar widgets that collect the visualization settings.
dim_red = st.sidebar.selectbox(
    'Select dimension reduction method',
    ('PCA','TSNE'))
dimension = st.sidebar.selectbox(
    "Select the dimension of the visualization",
    ('2D', '3D'))
user_input = st.sidebar.text_input("Type the word that you want to investigate. You can type more than one word by separating one word with other with comma (,)",'')
top_n = st.sidebar.slider('Select the amount of words associated with the input words you want to visualize ',
    5, 100, 5)
annotation = st.sidebar.radio(
    "Enable or disable the annotation on the visualization",
    ('On', 'Off'))

if dim_red == 'TSNE':
    # The t-SNE hyper-parameters are only offered when t-SNE is selected.
    perplexity = st.sidebar.slider('Adjust the perplexity. The perplexity is related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity',
        5, 50, 30)
    learning_rate = st.sidebar.slider('Adjust the learning rate',
        10, 1000, 200)
    iteration = st.sidebar.slider('Adjust the number of iteration',
        250, 100000, 1000)
else:
    # Keep the names bound when PCA is selected so later code can pass them
    # on unconditionally.
    perplexity = 0
    learning_rate = 0
    iteration = 0
import plotly
import plotly.graph_objs as go
import numpy as np
import pickle
import streamlit as st
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

filename = 'glove2word2vec_model.sav'
# NOTE: pickle can execute arbitrary code while loading; only load a model
# file that you produced yourself.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)


def append_list(sim_words, words):
    """Tag every entry of ``sim_words`` with the query word it came from.

    Args:
        sim_words: Sequence of result entries (iterables such as the
            ``(word, similarity)`` pairs returned by ``most_similar``).
        words: The query word appended as the last element of each entry.

    Returns:
        A list of tuples, each being the original entry followed by ``words``.
    """
    return [tuple(entry) + (words,) for entry in sim_words]


def _resolve_words(model, words, sample):
    """Return ``words`` unchanged, or pick words from the model vocabulary when it is None."""
    # `is not None` instead of `== None`: comparing a numpy array with `==`
    # is element-wise and cannot be used as a condition.
    if words is not None:
        return words
    # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases
    # expose it as `vocab`.
    vocabulary = getattr(model, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = model.vocab
    if sample > 0:
        return np.random.choice(list(vocabulary.keys()), sample)
    return list(vocabulary)


def _reduce_dimensions(word_vectors, dim_red, n_components, perplexity, learning_rate, iteration):
    """Project ``word_vectors`` to ``n_components`` columns with PCA or t-SNE."""
    if dim_red == 'PCA':
        return PCA(random_state=0).fit_transform(word_vectors)[:, :n_components]
    # NOTE(review): newer scikit-learn releases rename n_iter to max_iter --
    # check against the installed version.
    reducer = TSNE(n_components=n_components, random_state=0, perplexity=perplexity, learning_rate=learning_rate, n_iter=iteration)
    return reducer.fit_transform(word_vectors)[:, :n_components]


def display_scatterplot_3D(model, user_input=None, words=None, label=None, color_map=None, annotation='On', dim_red = 'PCA', perplexity = 0, learning_rate = 0, iteration = 0, topn=0, sample=10):
    """Render a 3D scatter plot of word vectors in the Streamlit app.

    Args:
        model: Word-vector model; indexed as ``model[word]`` and, when
            ``words`` is None, asked for its vocabulary.
        user_input: Sequence of input words; one trace of ``topn`` points is
            drawn per entry. None is treated as "no input words".
        words: Words to plot, ordered as ``topn`` entries per input word
            followed by the input words themselves. When None, words are
            taken from the model vocabulary.
        label: Accepted but not used.
        color_map: Accepted but not used.
        annotation: ``'On'`` shows the word next to each neighbour point; any
            other value hides those labels. Input-word labels are always shown.
        dim_red: ``'PCA'`` selects PCA; any other value selects t-SNE.
        perplexity: Forwarded to ``TSNE`` when t-SNE is selected.
        learning_rate: Forwarded to ``TSNE`` when t-SNE is selected.
        iteration: Forwarded to ``TSNE(n_iter=...)`` when t-SNE is selected.
        topn: Number of consecutive entries of ``words`` per input word.
        sample: Number of vocabulary words drawn at random when ``words`` is
            None; a value <= 0 selects the whole vocabulary.
    """
    words = _resolve_words(model, words, sample)
    if user_input is None:
        # The default used to crash on len(None); treat it as an empty list.
        user_input = []

    word_vectors = np.array([model[w] for w in words])
    three_dim = _reduce_dimensions(word_vectors, dim_red, 3, perplexity, learning_rate, iteration)

    # Three cones anchored at the origin, one pointing along each axis.
    color = 'blue'
    quiver = go.Cone(
        x=[0, 0, 0],
        y=[0, 0, 0],
        z=[0, 0, 0],
        u=[1.5, 0, 0],
        v=[0, 1.5, 0],
        w=[0, 0, 1.5],
        anchor="tail",
        colorscale=[[0, color], [1, color]],
        showscale=False
    )

    data = [quiver]
    count = 0
    for input_word in user_input:
        group = slice(count, count + topn)
        trace = go.Scatter3d(
            x=three_dim[group, 0],
            y=three_dim[group, 1],
            z=three_dim[group, 2],
            text=words[group] if annotation == 'On' else '',
            name=input_word,
            textposition="top center",
            textfont_size=30,
            mode='markers+text',
            marker={
                'size': 10,
                'opacity': 0.8,
                'color': 2
            }
        )
        data.append(trace)
        count = count + topn

    # Everything after the per-input groups is plotted as the input words.
    trace_input = go.Scatter3d(
        x=three_dim[count:, 0],
        y=three_dim[count:, 1],
        z=three_dim[count:, 2],
        text=words[count:],
        name='input words',
        textposition="top center",
        textfont_size=30,
        mode='markers+text',
        marker={
            'size': 10,
            'opacity': 1,
            'color': 'black'
        }
    )
    data.append(trace_input)

    # Configure the layout
    layout = go.Layout(
        margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
        showlegend=True,
        legend=dict(
            x=1,
            y=0.5,
            font=dict(
                family="Courier New",
                size=25,
                color="black"
            )),
        font=dict(
            family=" Courier New ",
            size=15),
        autosize=False,
        width=1000,
        height=1000
    )

    plot_figure = go.Figure(data=data, layout=layout)
    st.plotly_chart(plot_figure)


def horizontal_bar(word, similarity):
    """Render a horizontal bar chart of similarity scores in the Streamlit app.

    Args:
        word: Labels for the bars (y axis).
        similarity: Numeric scores; rounded to two decimals before plotting.
    """
    similarity = [round(elem, 2) for elem in similarity]

    data = go.Bar(
        x=similarity,
        y=word,
        orientation='h',
        text=similarity,
        marker_color=4,
        textposition='auto')

    layout = go.Layout(
        font=dict(size=20),
        xaxis=dict(showticklabels=False, automargin=True),
        # Reversed so the first entry is drawn at the top.
        yaxis=dict(showticklabels=True, automargin=True, autorange="reversed"),
        margin=dict(t=20, b=20, r=10)
    )

    plot_figure = go.Figure(data=data, layout=layout)
    st.plotly_chart(plot_figure)


def display_scatterplot_2D(model, user_input=None, words=None, label=None, color_map=None, annotation='On', dim_red = 'PCA', perplexity = 0, learning_rate = 0, iteration = 0, topn=0, sample=10):
    """Render a 2D scatter plot of word vectors in the Streamlit app.

    Args:
        model: Word-vector model; indexed as ``model[word]`` and, when
            ``words`` is None, asked for its vocabulary.
        user_input: Sequence of input words; one trace of ``topn`` points is
            drawn per entry. None is treated as "no input words".
        words: Words to plot, ordered as ``topn`` entries per input word
            followed by the input words themselves. When None, words are
            taken from the model vocabulary.
        label: Accepted but not used.
        color_map: Accepted but not used.
        annotation: ``'On'`` shows the word next to each neighbour point; any
            other value hides those labels. Input-word labels are always shown.
        dim_red: ``'PCA'`` selects PCA; any other value selects t-SNE.
        perplexity: Forwarded to ``TSNE`` when t-SNE is selected.
        learning_rate: Forwarded to ``TSNE`` when t-SNE is selected.
        iteration: Forwarded to ``TSNE(n_iter=...)`` when t-SNE is selected.
        topn: Number of consecutive entries of ``words`` per input word.
        sample: Number of vocabulary words drawn at random when ``words`` is
            None; a value <= 0 selects the whole vocabulary.
    """
    words = _resolve_words(model, words, sample)
    if user_input is None:
        # The default used to crash on len(None); treat it as an empty list.
        user_input = []

    word_vectors = np.array([model[w] for w in words])
    two_dim = _reduce_dimensions(word_vectors, dim_red, 2, perplexity, learning_rate, iteration)

    data = []
    count = 0
    for input_word in user_input:
        group = slice(count, count + topn)
        trace = go.Scatter(
            x=two_dim[group, 0],
            y=two_dim[group, 1],
            text=words[group] if annotation == 'On' else '',
            name=input_word,
            textposition="top center",
            textfont_size=20,
            mode='markers+text',
            marker={
                'size': 15,
                'opacity': 0.8,
                'color': 2
            }
        )
        data.append(trace)
        count = count + topn

    # Everything after the per-input groups is plotted as the input words.
    trace_input = go.Scatter(
        x=two_dim[count:, 0],
        y=two_dim[count:, 1],
        text=words[count:],
        name='input words',
        textposition="top center",
        textfont_size=20,
        mode='markers+text',
        marker={
            'size': 25,
            'opacity': 1,
            'color': 'black'
        }
    )
    data.append(trace_input)

    # Configure the layout
    layout = go.Layout(
        margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
        showlegend=True,
        hoverlabel=dict(
            bgcolor="white",
            font_size=20,
            font_family="Courier New"),
        legend=dict(
            x=1,
            y=0.5,
            font=dict(
                family="Courier New",
                size=25,
                color="black"
            )),
        font=dict(
            family=" Courier New ",
            size=15),
        autosize=False,
        width=1000,
        height=1000
    )

    plot_figure = go.Figure(data=data, layout=layout)
    st.plotly_chart(plot_figure)


# Sidebar widgets that collect the visualization settings.
dim_red = st.sidebar.selectbox(
    'Select dimension reduction method',
    ('PCA','TSNE'))
dimension = st.sidebar.selectbox(
    "Select the dimension of the visualization",
    ('2D', '3D'))
user_input = st.sidebar.text_input("Type the word that you want to investigate. You can type more than one word by separating one word with other with comma (,)",'')
top_n = st.sidebar.slider('Select the amount of words associated with the input words you want to visualize ',
    5, 100, 5)
annotation = st.sidebar.radio(
    "Enable or disable the annotation on the visualization",
    ('On', 'Off'))

if dim_red == 'TSNE':
    # The t-SNE hyper-parameters are only offered when t-SNE is selected.
    perplexity = st.sidebar.slider('Adjust the perplexity. The perplexity is related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity',
        5, 50, 30)
    learning_rate = st.sidebar.slider('Adjust the learning rate',
        10, 1000, 200)
    iteration = st.sidebar.slider('Adjust the number of iteration',
        250, 100000, 1000)
else:
    perplexity = 0
    learning_rate = 0
    iteration = 0

if user_input == '':
    # No query yet: the display functions fall back to vocabulary words.
    similar_word = None
    similarity = None
    labels = None
    color_map = None
else:
    user_input = [x.strip() for x in user_input.split(',')]
    # Collect (similar word, similarity, query word) triples per query word.
    result_word = []
    for words in user_input:
        sim_words = model.most_similar(words, topn=top_n)
        sim_words = append_list(sim_words, words)
        result_word.extend(sim_words)

    similar_word = [word[0] for word in result_word]
    similarity = [word[1] for word in result_word]
    # The query words themselves are plotted too, after all their neighbours.
    similar_word.extend(user_input)
    labels = [word[2] for word in result_word]
    # Map every distinct query word to a 1-based integer id.
    label_dict = {label: index + 1 for index, label in enumerate(set(labels))}
    color_map = [label_dict[x] for x in labels]

st.title('Word Embedding Visualization Based on Cosine Similarity')

st.header('This is a web app to visualize the word embedding.')
st.markdown('First, choose which dimension of visualization that you want to see. There are two options: 2D and 3D.')
st.markdown('Next, type the word that you want to investigate. You can type more than one word by separating one word with other with comma (,).')
st.markdown('With the slider in the sidebar, you can pick the amount of words associated with the input word you want to visualize. This is done by computing the cosine similarity between vectors of words in embedding space.')
st.markdown('Lastly, you have an option to enable or disable the text annotation in the visualization.')

# The original called display_pca_scatterplot_2D / display_pca_scatterplot_3D,
# which are not defined in this script; the functions defined above are
# display_scatterplot_2D / display_scatterplot_3D.
if dimension == '2D':
    st.header('2D Visualization')
    st.write('For more detail about each point (just in case it is difficult to read the annotation), you can hover around each points to see the words. You can expand the visualization by clicking expand symbol in the top right corner of the visualization.')
    display_scatterplot_2D(model, user_input, similar_word, labels, color_map, annotation, dim_red, perplexity, learning_rate, iteration, top_n)
else:
    st.header('3D Visualization')
    st.write('For more detail about each point (just in case it is difficult to read the annotation), you can hover around each points to see the words. You can expand the visualization by clicking expand symbol in the top right corner of the visualization.')
    display_scatterplot_3D(model, user_input, similar_word, labels, color_map, annotation, dim_red, perplexity, learning_rate, iteration, top_n)

st.header('The Top 5 Most Similar Words for Each Input')
# Each input word owns top_n consecutive entries; only the first 5 are charted.
# When no word was typed, user_input is the empty string and the loop is skipped.
count = 0
for input_word in user_input:
    st.write('The most similar words from '+str(input_word)+' are:')
    horizontal_bar(similar_word[count:count+5], similarity[count:count+5])
    count = count + top_n
$ streamlit run your_script_name.py