本文共 8847 字,大约阅读时间需要 29 分钟。
# Load the IMDB movie-review dataset shipped with Keras.
imdb = keras.datasets.imdb

# Keep only the most frequent words; everything rarer is mapped to a
# special out-of-vocabulary id by load_data.
vocab_size = 10000
# Shift every word id up by 3 so ids 0-3 stay free for special tokens.
index_from = 3

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=vocab_size,
    index_from=index_from,
)

print(type(train_data))
print(type(train_data[0]))
# Fetch the word -> integer-id vocabulary for the dataset and report its size.
word_index = imdb.get_word_index()
print(len(word_index))
# Shift every id by 3 so that ids 0-3 can be reserved for special tokens.
word_index = {word: idx + 3 for word, idx in word_index.items()}
# Reserve ids 0-3 for special tokens.
# BUG FIX: the original text lost the angle-bracket token names (the
# scrape ate "<PAD>" etc.), leaving a '' key and the SAME ' ' key assigned
# three times — so only ' ' -> 3 survived and ids 1 and 2 had no token.
word_index[''] = 0         # kept as alias: pad_sequences below uses word_index['']
word_index['<PAD>'] = 0    # padding token
word_index['<START>'] = 1  # sequence-start marker
word_index['<UNK>'] = 2    # out-of-vocabulary token
word_index['<END>'] = 3    # sequence-end marker
# Invert the vocabulary so id sequences can be decoded back into words.
reverse_word_index = {idx: word for word, idx in word_index.items()}
def decode_review(text_ids):
    """Turn a sequence of word ids back into a readable review string.

    Unknown ids fall back to the empty string, matching the original code.
    """
    words = (reverse_word_index.get(word_id, "") for word_id in text_ids)
    return ' '.join(words)

decode_review(train_data[0])
# Truncate or pad every review to a fixed length of 500 tokens.
max_length = 500


def _pad(sequences):
    """Pad (after the text) or truncate each id sequence to max_length."""
    return keras.preprocessing.sequence.pad_sequences(
        sequences,                # list of lists of word ids
        value=word_index[''],     # id 0 — the padding token
        padding='post',           # 'post': pad after the text; 'pre': before
        maxlen=max_length,
    )


# The original repeated the identical pad_sequences call for train and test;
# deduplicated into the _pad helper above.
train_data = _pad(train_data)
test_data = _pad(test_data)
print(train_data[0])
def plot_learning_curves(history, label, epochs, min_value, max_value):
    """Plot the training and validation curves for one metric.

    history: the History object returned by model.fit.
    label: metric name, e.g. 'accuracy' or 'loss'.
    epochs / min_value / max_value: axis limits for the plot.
    """
    curves = {
        label: history.history[label],
        'val_' + label: history.history['val_' + label],
    }
    pd.DataFrame(curves).plot(figsize=(8, 5))
    plt.grid(True)
    plt.axis([0, epochs, min_value, max_value])
    plt.show()
# Every word id is embedded into a dense vector of this size.
embedding_dim = 16
# Number of reviews per training batch.
batch_size = 512
# model = keras.models.Sequential([# # 1. define matrix: [vocab_size, embedding_dim]# # 2. [1,2,3,4..], max_length * embedding_dim# # 3. batch_size * max_length * embedding_dim# keras.layers.Embedding(vocab_size, embedding_dim,# input_length = max_length),# keras.layers.SimpleRNN(units = 64, return_sequences = False),# keras.layers.Dense(64, activation = 'relu'),# keras.layers.Dense(1, activation='sigmoid'),# ])# model.summary()# model.compile(optimizer = 'adam',# loss = 'binary_crossentropy',# metrics = ['accuracy'])
# model = keras.models.Sequential([# # 1. define matrix: [vocab_size, embedding_dim]# # 2. [1,2,3,4..], max_length * embedding_dim# # 3. batch_size * max_length * embedding_dim# keras.layers.Embedding(vocab_size, embedding_dim,# input_length = max_length),# keras.layers.Bidirectional(# keras.layers.SimpleRNN(# units = 64, return_sequences = True)),# keras.layers.Bidirectional(# keras.layers.SimpleRNN(# units = 64, return_sequences = False)),# keras.layers.Dense(64, activation = 'relu'),# keras.layers.Dense(1, activation='sigmoid'),# ])# model.summary()# model.compile(optimizer = 'adam',# loss = 'binary_crossentropy',# metrics = ['accuracy'])
# Single-layer bidirectional SimpleRNN classifier.
# The Embedding layer is a [vocab_size, embedding_dim] lookup table, so each
# batch becomes a tensor of shape batch_size * max_length * embedding_dim.
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim,
                           input_length=max_length),
    keras.layers.Bidirectional(
        keras.layers.SimpleRNN(units=32, return_sequences=False)),
    keras.layers.Dense(32, activation='relu'),
    # Sigmoid output: binary positive/negative sentiment probability.
    keras.layers.Dense(1, activation='sigmoid'),
])
model.summary()
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# Train on 80% of the training data, validating on the remaining 20%.
history_single_rnn = model.fit(
    train_data,
    train_labels,
    epochs=10,
    batch_size=batch_size,
    validation_split=0.2,
)

# Plot accuracy and loss curves for training and validation.
plot_learning_curves(history_single_rnn, 'accuracy', 30, 0, 1)
plot_learning_curves(history_single_rnn, 'loss', 30, 0, 1)

# Score the trained model on the held-out test set.
model.evaluate(
    test_data,
    test_labels,
    batch_size=batch_size,
    verbose=0,
)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""IMDB sentiment classification with a bidirectional SimpleRNN.

Created on Thu Mar 4 10:52:02 2021
@author: nijiahui
"""
import os
import sys
import time

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

# 1. Load the IMDB review dataset from Keras.
imdb = keras.datasets.imdb
vocab_size = 10000  # keep the 10,000 most frequent words; the rest become OOV
index_from = 3      # shift word ids by 3 to leave room for special tokens
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=vocab_size, index_from=index_from)
print(type(train_data))
print(type(train_data[0]))

# 2. Build the vocabulary mapping and reserve ids 0-3 for special tokens.
word_index = imdb.get_word_index()
print(len(word_index))
word_index = {word: idx + 3 for word, idx in word_index.items()}
# BUG FIX: the scraped original lost the token names, assigning '' and the
# same ' ' key three times — and then padded with word_index[' '] == 3
# (the <END> slot) instead of the PAD id 0.
word_index['<PAD>'] = 0    # padding token
word_index['<START>'] = 1  # sequence-start marker
word_index['<UNK>'] = 2    # out-of-vocabulary token
word_index['<END>'] = 3    # sequence-end marker

# Inverted mapping: id -> word, used to decode reviews back to text.
reverse_word_index = {idx: word for word, idx in word_index.items()}


def decode_review(text_ids):
    """Turn a sequence of word ids back into a readable review string."""
    return ' '.join(reverse_word_index.get(word_id, " ")
                    for word_id in text_ids)


decode_review(train_data[0])

# 3. Pad or truncate every review to a fixed length.
max_length = 500
train_data = keras.preprocessing.sequence.pad_sequences(
    train_data,                  # list of lists of word ids
    value=word_index['<PAD>'],   # pad with id 0 (was word_index[' '] == 3)
    padding='post',              # 'post': pad after the text; 'pre': before
    maxlen=max_length)
test_data = keras.preprocessing.sequence.pad_sequences(
    test_data,
    value=word_index['<PAD>'],
    padding='post',
    maxlen=max_length)
print(train_data[0])


# 4. Learning-curve plotting helper.
def plot_learning_curves(history, label, epochs, min_value, max_value):
    """Plot the training and validation curves for one metric."""
    data = {
        label: history.history[label],
        'val_' + label: history.history['val_' + label],
    }
    pd.DataFrame(data).plot(figsize=(8, 5))
    plt.grid(True)
    plt.axis([0, epochs, min_value, max_value])
    plt.show()


# 5. Define the model.
embedding_dim = 16  # every word id is embedded into a 16-dim vector
batch_size = 512

# Single-layer bidirectional SimpleRNN (the original also contained two
# commented-out variants: a plain SimpleRNN and a two-layer bidirectional
# stack, both with 64 units).
model = keras.models.Sequential([
    # The Embedding layer is a [vocab_size, embedding_dim] lookup table, so
    # each batch becomes batch_size * max_length * embedding_dim.
    keras.layers.Embedding(vocab_size, embedding_dim,
                           input_length=max_length),
    keras.layers.Bidirectional(
        keras.layers.SimpleRNN(units=32, return_sequences=False)),
    keras.layers.Dense(32, activation='relu'),
    # Sigmoid output: binary positive/negative sentiment probability.
    keras.layers.Dense(1, activation='sigmoid'),
])
model.summary()
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# 6. Train on 80% of the training data, validating on the remaining 20%.
history_single_rnn = model.fit(
    train_data, train_labels,
    epochs=10,
    batch_size=batch_size,
    validation_split=0.2)

# 7. Plot accuracy and loss curves.
plot_learning_curves(history_single_rnn, 'accuracy', 30, 0, 1)
plot_learning_curves(history_single_rnn, 'loss', 30, 0, 1)

# 8. Evaluate on the held-out test set.
model.evaluate(
    test_data, test_labels,
    batch_size=batch_size,
    verbose=0)
转载地址:http://evili.baihongyu.com/