Chapter 6 (1)
data.py
import re
import collections
import jieba
import tensorflow as tf

def readfile(filename):
    # Read the corpus, strip all whitespace and segment it with jieba.
    # (The function header and the file read are reconstructed; utf-8 encoding is assumed.)
    with open(filename, 'r', encoding='utf-8') as f:
        alltext = f.read()
    alltext = re.sub(r'\s', '', alltext)
    seglist = list(jieba.cut(alltext, cut_all=False))
    return seglist
def _build_vocab(filename):
    # Build word<->id mappings sorted by descending frequency, then map the corpus to ids.
    data = readfile(filename)
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))
    id_to_word = dict(zip(range(len(words)), words))
    dataids = []
    for w in data:
        dataids.append(word_to_id[w])
    return word_to_id, id_to_word, dataids
def dataproducer(batch_size, num_steps, filename):
    # Fold the id sequence into a [batch_size, batchlen] grid and emit num_steps-wide
    # slices; y is x shifted one position to the right (the next-word targets).
    word_to_id, id_to_word, data = _build_vocab(filename)
    datalen = len(data)
    batchlen = datalen // batch_size
    epoch_size = (batchlen - 1) // num_steps
    data = tf.reshape(data[0: batchlen * batch_size], [batch_size, batchlen])
    i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
    x = tf.slice(data, [0, i * num_steps], [batch_size, num_steps])
    y = tf.slice(data, [0, i * num_steps + 1], [batch_size, num_steps])
    x.set_shape([batch_size, num_steps])
    y.set_shape([batch_size, num_steps])
    return x, y, id_to_word
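To make the slicing arithmetic in dataproducer concrete, here is a small standalone illustration (not part of data.py, with made-up numbers): the id sequence is folded into a [batch_size, batchlen] grid, and each step takes a num_steps-wide slice of inputs x together with targets y shifted one position to the right.

# Toy illustration of dataproducer's slicing (hypothetical data, plain numpy).
import numpy as np

ids = np.arange(20)                          # pretend corpus of 20 word ids
batch_size, num_steps = 2, 3
batchlen = len(ids) // batch_size            # 10 columns per row
epoch_size = (batchlen - 1) // num_steps     # 3 slices fit before running off the end
data = ids[:batch_size * batchlen].reshape(batch_size, batchlen)
for i in range(epoch_size):
    x = data[:, i * num_steps: (i + 1) * num_steps]
    y = data[:, i * num_steps + 1: (i + 1) * num_steps + 1]
    print(x, y)  # i=0: x=[[0 1 2],[10 11 12]], y=[[1 2 3],[11 12 13]]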
lstm.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import tensorflow as tf
from data import *
import numpy as np
import random
def random_distribution():
    """Generate a random column of probabilities."""
    b = np.random.uniform(0.0, 1.0, size=[1, vocab_size])
    return b / np.sum(b, 1)[:, None]
def sample_distribution(distribution):
    """Sample one element from a distribution assumed to be an array of
    normalized probabilities."""
    r = random.uniform(0, 1)
    s = 0
    for i in range(len(distribution[0])):
        s += distribution[0][i]
        if s >= r:
            return i
    return len(distribution[0]) - 1
def sample(prediction):
    d = sample_distribution(prediction)
    re = []
    re.append(d)
    return re
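As an aside (not part of lstm.py), the two helpers above do inverse-CDF sampling: a uniform draw r is compared against the running sum of the probabilities, so indices with larger probability come back more often. A quick check with a made-up distribution:

# Aside with made-up numbers: indices come back roughly in proportion to their probability.
toy = np.array([[0.1, 0.2, 0.7]])   # already normalized, shape [1, 3]
counts = [0, 0, 0]
for _ in range(1000):
    counts[sample(toy)[0]] += 1
print(counts)                        # roughly [100, 200, 700]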
Model parameter settings
learning_rate = 1.0
num_steps = 35
hidden_size = 300
keep_prob = 1.0
lr_decay = 0.5
batch_size = 20
num_layers = 3
max_epoch = 14
Corpus file
filename = 'novel.txt'
x, y, id_to_word = dataproducer(batch_size, num_steps, filename)
vocab_size = len(id_to_word)
size = hidden_size
Build the LSTM model
def make_lstm_cell():
    c = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.5)
    return tf.nn.rnn_cell.DropoutWrapper(c, output_keep_prob=keep_prob)
# MultiRNNCell expects a list of cells, one per layer, not a layer count.
cell = tf.nn.rnn_cell.MultiRNNCell([make_lstm_cell() for _ in range(num_layers)])
initial_state = cell.zero_state(batch_size, tf.float32)
state = initial_state
embedding = tf.get_variable('embedding', [vocab_size, size])
input_data = x
targets = y
test_input = tf.placeholder(tf.int32, shape=[1])
test_initial_state = cell.zero_state(1, tf.float32)
inputs = tf.nn.embedding_lookup(embedding, input_data)
test_inputs = tf.nn.embedding_lookup(embedding, test_input)
outputs = []
initializer = tf.random_uniform_initializer(-0.1, 0.1)
Backpropagate the error on the training data to adjust the model. TensorFlow implements variable sharing mainly through variable scopes (a small standalone illustration follows the training code below).
with tf.variable_scope("Model", reuse=None, initializer=initializer):
with tf.variable_scope("r", reuse=None, initializer=initializer):
softmax_w = tf.get_variable('softmax_w', [size, vocab_size])
softmax_b = tf.get_variable('softmax_b', [vocab_size])
with tf.variable_scope("RNN", reuse=None, initializer=initializer):
for time_step in range(num_steps):
if time_step > 0: tf.get_variable_scope().reuse_variables()
(cell_output, state) = cell(inputs[:, time_step, :], state, )
outputs.append(cell_output)
output = tf.reshape(outputs, [-1, size])
logits = tf.matmul(output, softmax_w) + softmax_b
loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [tf.reshape(targets, [-1])], [tf.ones([batch_size * num_steps])])
global_step = tf.Variable(0)
learning_rate = tf.train.exponential_decay(
10.0, global_step, 5000, 0.1, staircase=True)
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
gradients, v = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)
cost = tf.reduce_sum(loss) / batch_size
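As mentioned after the heading above, variable sharing in TensorFlow 1.x hinges on variable scopes: inside a scope opened with reuse=True, tf.get_variable returns the variable that already exists under that name instead of creating a new one. A minimal standalone illustration (the scope name scope_demo is made up; this is not part of lstm.py):

# Standalone illustration of variable sharing via variable scopes (TF 1.x).
with tf.variable_scope("scope_demo"):
    w1 = tf.get_variable("w", shape=[2, 2])
with tf.variable_scope("scope_demo", reuse=True):
    w2 = tf.get_variable("w", shape=[2, 2])
print(w1 is w2)   # True: both names refer to the same underlying variable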
Predict the next output
teststate = test_initial_state
# Share the trained weights with the single-step prediction path (the reuse scope
# here is assumed; without it the cell would create fresh, untrained variables).
with tf.variable_scope("Model", reuse=True, initializer=initializer):
    with tf.variable_scope("RNN", reuse=True):
        (celloutput, teststate) = cell(test_inputs, teststate)
partial_logits = tf.matmul(celloutput, softmax_w) + softmax_b
partial_logits = tf.nn.softmax(partial_logits)
Run a TensorFlow session over the ops built above
sv = tf.train.Supervisor(logdir=None)
with sv.managed_session() as session:
    costs = 0
    iters = 0
    for i in range(100000):
        _, l = session.run([optimizer, cost])
        costs += l
        iters += num_steps
        perplexity = np.exp(costs / iters)
        if i % 20 == 0:
            print(perplexity)
        if i % 100 == 0:
            p = random_distribution()
            b = sample(p)
            sentence = id_to_word[b[0]]
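The listing ends here, so the test_input placeholder and the partial_logits op defined above are never actually exercised. Below is a minimal sketch of how they could be driven from inside the if i % 100 == 0: branch to grow a sentence word by word; the loop length and the print are assumptions, not the book's code.

# Assumed continuation of the i % 100 == 0 branch: feed each sampled id back in
# through test_input and append the next sampled word to the sentence.
for _ in range(50):
    predictions = session.run(partial_logits, feed_dict={test_input: b})
    b = sample(predictions)
    sentence += id_to_word[b[0]]
print(sentence)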