import tensorflow as tf
from tensorflow import keras
import numpy as np
# Show the installed TensorFlow version (the output below was produced on 1.12.0).
print(tf.__version__)
複製代碼
1.12.0
複製代碼
# IMDB movie-review dataset bundled with Keras; reviews come pre-encoded
# as lists of word indices.
imdb = keras.datasets.imdb
# num_words=10000 keeps only the 10,000 most frequent words; indices for
# out-of-vocabulary words appear as 2 (mapped to "<UNK>" further below).
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
複製代碼
# Both splits hold 25,000 reviews, each a variable-length list of indices.
print(f"Training entries: {len(train_data)}, labels: {len(train_labels)}")
print(f'train_data 第一個數據:\n{train_data[0]}')
# NOTE: a bare tuple expression only displays its value in a notebook/REPL;
# as a plain script this line has no effect.
len(train_data[0]), len(train_data[1])
複製代碼
Training entries: 25000, labels: 25000
train_data 第一個數據:
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
(218, 189)
複製代碼
能夠看到,每一行數據的長度並不相同,這會形成輸入張量的長度不一。
下面咱們須要將 index(數字) 對應的字映射回去。
# A dictionary mapping words to an integer index.
word_index = imdb.get_word_index()

# Shift every index up by 3 so the first slots can hold special tokens.
word_index = {word: index + 3 for word, index in word_index.items()}
word_index["<PAD>"] = 0      # padding token
word_index["<START>"] = 1    # start-of-review marker
word_index["<UNK>"] = 2      # unknown / out-of-vocabulary word
word_index["<UNUSED>"] = 3

# Inverse mapping: integer index -> word.  A dict comprehension replaces the
# original dict([(value, key) ...]) construction (same result, idiomatic form).
reverse_word_index = {index: word for word, index in word_index.items()}


def decode_review(text):
    """Decode a sequence of word indices back into a readable string.

    Indices with no known word are rendered as '?'.
    """
    return ' '.join(reverse_word_index.get(i, '?') for i in text)
複製代碼
影評(整數數組)必須轉換爲張量,而後才能饋送到神經網絡中。能夠經過如下兩種方法實現這種轉換:
對數組進行獨熱編碼,將它們轉換爲由 0 和 1 構成的向量。例如,序列 [3, 5]
將變成一個 10000 維的向量,除索引 3 和 5 轉換爲 1 以外,其他全轉換爲 0。而後,將它做爲網絡的第一層,一個能夠處理浮點向量數據的密集層。不過,這種方法會佔用大量內存,須要一個大小爲 num_words * num_reviews
的矩陣。
或者,咱們能夠填充數組,使它們都具備相同的長度,而後建立一個形狀爲 max_length * num_reviews
的整數張量。咱們能夠使用一個可以處理這種形狀的嵌入層做爲網絡中的第一層。
此處使用第二種方法,使用 keras.preprocessing.sequence.pad_sequences。
# Pad (or truncate) every review to exactly 256 tokens so the data forms a
# rectangular integer tensor.  Padding uses the index of the "<PAD>" token
# and is appended after the review text ('post').
_pad_options = dict(value=word_index["<PAD>"], padding='post', maxlen=256)
train_data = keras.preprocessing.sequence.pad_sequences(train_data, **_pad_options)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, **_pad_options)
複製代碼
# Verify the padding: both sample reviews are now exactly 256 tokens long.
print(f'train_data 第一個數據:\n{train_data[0]}')
# Bare tuple expression — only echoes its value in a notebook/REPL.
len(train_data[0]), len(train_data[1])
複製代碼
train_data 第一個數據:
[ 1 14 22 16 43 530 973 1622 1385 65 458 4468 66 3941
4 173 36 256 5 25 100 43 838 112 50 670 2 9
35 480 284 5 150 4 172 112 167 2 336 385 39 4
172 4536 1111 17 546 38 13 447 4 192 50 16 6 147
2025 19 14 22 4 1920 4613 469 4 22 71 87 12 16
43 530 38 76 15 13 1247 4 22 17 515 17 12 16
626 18 2 5 62 386 12 8 316 8 106 5 4 2223
5244 16 480 66 3785 33 4 130 12 16 38 619 5 25
124 51 36 135 48 25 1415 33 6 22 12 215 28 77
52 5 14 407 16 82 2 8 4 107 117 5952 15 256
4 2 7 3766 5 723 36 71 43 530 476 26 400 317
46 7 4 2 1029 13 104 88 4 381 15 297 98 32
2071 56 26 141 6 194 7486 18 4 226 22 21 134 476
26 480 5 144 30 5535 18 51 36 28 224 92 25 104
4 226 65 16 38 1334 88 12 16 283 5 16 4472 113
103 32 15 16 5345 19 178 32 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0]
(256, 256)複製代碼