What is MNIST?
A dataset of handwritten digit images: 60,000 training images and 10,000 test images, with the images and labels provided separately.
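To see what the loader actually returns, here is a quick sketch (with the default read_data_sets split: 55,000 training, 5,000 validation, and 10,000 test images):

from tensorflow.examples.tutorials.mnist import input_data

# Downloads MNIST on first use and loads it with one-hot labels
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

print(mnist.train.images.shape)  # (55000, 784): each 28x28 image flattened
print(mnist.train.labels.shape)  # (55000, 10): one-hot label vectors
print(mnist.test.images.shape)   # (10000, 784)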
Softmax Classification
import tensorflow as tf

# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

# Hyperparameters
learning_rate = 0.1
training_epochs = 100
batch_size = 100
display_step = 1

x = tf.placeholder(tf.float32, [None, 784])  # MNIST data image of shape 28 * 28 = 784
y = tf.placeholder(tf.float32, [None, 10])   # 0-9 digits recognition => 10 classes

W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

# Our hypothesis: softmax over a single linear layer
hypothesis = tf.nn.softmax(tf.matmul(x, W) + b)

# Cross-entropy cost, averaged over the batch
cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(hypothesis), reduction_indices=1))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(mnist.train.num_examples / batch_size)
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_xs,
                                                          y: batch_ys})
            avg_cost += c / total_batch
        if (epoch + 1) % display_step == 0:
            print("Epoch:", "%04d" % (epoch + 1), "cost=", "{:.9f}".format(avg_cost))
    print("Optimization Finished")

    correct_prediction = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print("Accuracy :", accuracy.eval({x: mnist.test.images[:3000], y: mnist.test.labels[:3000]}))
>>
Epoch: 0001 cost= 0.549793657
Epoch: 0002 cost= 0.365917619
Epoch: 0003 cost= 0.336471418
Epoch: 0004 cost= 0.321302868
Epoch: 0005 cost= 0.311445449
Epoch: 0006 cost= 0.304242077
Epoch: 0007 cost= 0.298702431
Epoch: 0008 cost= 0.294171253
Epoch: 0009 cost= 0.290589680
Epoch: 0010 cost= 0.287615354
Epoch: 0011 cost= 0.285205509
Epoch: 0012 cost= 0.282605721
Epoch: 0013 cost= 0.280656712
Epoch: 0014 cost= 0.278752496
Epoch: 0015 cost= 0.276960958
Epoch: 0016 cost= 0.275564561
Epoch: 0017 cost= 0.274063677
Epoch: 0018 cost= 0.272891480
Epoch: 0019 cost= 0.271752509
Epoch: 0020 cost= 0.270317350
Epoch: 0021 cost= 0.269795367
Epoch: 0022 cost= 0.268844057
Epoch: 0023 cost= 0.267684363
Epoch: 0024 cost= 0.266922297
Epoch: 0025 cost= 0.265854956
Epoch: 0026 cost= 0.265229598
Epoch: 0027 cost= 0.264450046
Epoch: 0028 cost= 0.263791721
Epoch: 0029 cost= 0.263281115
Epoch: 0030 cost= 0.262600577
Epoch: 0031 cost= 0.262007129
Epoch: 0032 cost= 0.261414622
Epoch: 0033 cost= 0.260831268
Epoch: 0034 cost= 0.260189309
Epoch: 0035 cost= 0.259817611
Epoch: 0036 cost= 0.259107615
Epoch: 0037 cost= 0.258915079
Epoch: 0038 cost= 0.258304055
Epoch: 0039 cost= 0.257926129
Epoch: 0040 cost= 0.257498951
Epoch: 0041 cost= 0.257099035
Epoch: 0042 cost= 0.256646989
Epoch: 0043 cost= 0.256067123
Epoch: 0044 cost= 0.255762136
Epoch: 0045 cost= 0.255634582
Epoch: 0046 cost= 0.255250435
Epoch: 0047 cost= 0.254679573
Epoch: 0048 cost= 0.254290587
Epoch: 0049 cost= 0.254138282
Epoch: 0050 cost= 0.253763084
Epoch: 0051 cost= 0.253556872
Epoch: 0052 cost= 0.253135801
Epoch: 0053 cost= 0.253152398
Epoch: 0054 cost= 0.252507659
Epoch: 0055 cost= 0.252219338
Epoch: 0056 cost= 0.252087212
Epoch: 0057 cost= 0.251848758
Epoch: 0058 cost= 0.251586332
Epoch: 0059 cost= 0.250984929
Epoch: 0060 cost= 0.250978480
Epoch: 0061 cost= 0.250628741
Epoch: 0062 cost= 0.250371252
Epoch: 0063 cost= 0.250305910
Epoch: 0064 cost= 0.249886633
Epoch: 0065 cost= 0.249741890
Epoch: 0066 cost= 0.249591898
Epoch: 0067 cost= 0.249295501
Epoch: 0068 cost= 0.248912337
Epoch: 0069 cost= 0.248885540
Epoch: 0070 cost= 0.248635491
Epoch: 0071 cost= 0.248499536
Epoch: 0072 cost= 0.248069460
Epoch: 0073 cost= 0.247886870
Epoch: 0074 cost= 0.247626629
Epoch: 0075 cost= 0.247720547
Epoch: 0076 cost= 0.247139938
Epoch: 0077 cost= 0.247207163
Epoch: 0078 cost= 0.246980976
Epoch: 0079 cost= 0.246837126
Epoch: 0080 cost= 0.246621032
Epoch: 0081 cost= 0.246356185
Epoch: 0082 cost= 0.246140922
Epoch: 0083 cost= 0.246141711
Epoch: 0084 cost= 0.246086150
Epoch: 0085 cost= 0.245534417
Epoch: 0086 cost= 0.245546024
Epoch: 0087 cost= 0.245530334
Epoch: 0088 cost= 0.245331954
Epoch: 0089 cost= 0.245188148
Epoch: 0090 cost= 0.244963982
Epoch: 0091 cost= 0.244896263
Epoch: 0092 cost= 0.244639043
Epoch: 0093 cost= 0.244361808
Epoch: 0094 cost= 0.244400453
Epoch: 0095 cost= 0.244161099
Epoch: 0096 cost= 0.244207886
Epoch: 0097 cost= 0.243744206
Epoch: 0098 cost= 0.243778737
Epoch: 0099 cost= 0.243626120
Epoch: 0100 cost= 0.243623733
Optimization Finished
Accuracy : 0.900333
It achieves roughly 90% accuracy.
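As a quick sanity check, the same session can also be used to look at an individual prediction. A small usage sketch, assuming the session and graph from the script above are still open:

# Run inside the "with tf.Session() as sess:" block above
prediction = sess.run(tf.argmax(hypothesis, 1), feed_dict={x: mnist.test.images[:1]})
label = sess.run(tf.argmax(y, 1), feed_dict={y: mnist.test.labels[:1]})
print("Predicted:", prediction, "Actual:", label)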
Wide Neural Network
import tensorflow as tf

# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

# Hyperparameters
learning_rate = 0.001
training_epochs = 15
batch_size = 100
display_step = 1

x = tf.placeholder(tf.float32, [None, 784])  # MNIST data image of shape 28 * 28 = 784
y = tf.placeholder(tf.float32, [None, 10])   # 0-9 digits recognition => 10 classes

W1 = tf.Variable(tf.random_normal([784, 256]))
W2 = tf.Variable(tf.random_normal([256, 256]))
W3 = tf.Variable(tf.random_normal([256, 10]))
b1 = tf.Variable(tf.random_normal([256]))
b2 = tf.Variable(tf.random_normal([256]))
b3 = tf.Variable(tf.random_normal([10]))

# Our hypothesis: two ReLU hidden layers, output kept as logits
L1 = tf.nn.relu(tf.add(tf.matmul(x, W1), b1))
L2 = tf.nn.relu(tf.add(tf.matmul(L1, W2), b2))
hypothesis = tf.add(tf.matmul(L2, W3), b3)

# softmax_cross_entropy_with_logits applies softmax to the logits internally
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hypothesis, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(mnist.train.num_examples / batch_size)
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})
            avg_cost += sess.run(cost, feed_dict={x: batch_xs, y: batch_ys}) / total_batch
        if epoch % display_step == 0:
            print("Epoch:", "%04d" % (epoch + 1), "cost=", "{:.9f}".format(avg_cost))
    print("Optimization Finished")

    correct_prediction = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy :", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))
>>
Epoch: 0001 cost= 160.840851151
Epoch: 0002 cost= 40.710142645
Epoch: 0003 cost= 25.247591665
Epoch: 0004 cost= 17.450017703
Epoch: 0005 cost= 12.445894088
Epoch: 0006 cost= 9.229762022
Epoch: 0007 cost= 6.805903080
Epoch: 0008 cost= 5.057814985
Epoch: 0009 cost= 3.710952040
Epoch: 0010 cost= 2.739558045
Epoch: 0011 cost= 1.947978113
Epoch: 0012 cost= 1.378158916
Epoch: 0013 cost= 1.046368325
Epoch: 0014 cost= 0.729827283
Epoch: 0015 cost= 0.522628606
Optimization Finished
Accuracy : 0.9434
It achieves roughly 94% accuracy.
Wide Neural Network with Xavier initialization
What if we assigned the initial weight values more intelligently? Xavier initialization picks the initial weights according to the number of input and output nodes of each layer, giving the network a much better starting point.
def xavier_init(n_inputs, n_outputs, uniform=True):
    if uniform:
        init_range = tf.sqrt(6.0 / (n_inputs + n_outputs))
        return tf.random_uniform_initializer(-init_range, init_range)
    else:
        stddev = tf.sqrt(3.0 / (n_inputs + n_outputs))
        return tf.truncated_normal_initializer(stddev=stddev)
import tensorflow as tf

# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

# Hyperparameters
learning_rate = 0.001
training_epochs = 15
batch_size = 100
display_step = 1

x = tf.placeholder(tf.float32, [None, 784])  # MNIST data image of shape 28 * 28 = 784
y = tf.placeholder(tf.float32, [None, 10])   # 0-9 digits recognition => 10 classes

# Weights created with the xavier_init defined above
W1 = tf.get_variable("W1", shape=[784, 256], initializer=xavier_init(784, 256))
W2 = tf.get_variable("W2", shape=[256, 256], initializer=xavier_init(256, 256))
W3 = tf.get_variable("W3", shape=[256, 10], initializer=xavier_init(256, 10))
b1 = tf.Variable(tf.random_normal([256]))
b2 = tf.Variable(tf.random_normal([256]))
b3 = tf.Variable(tf.random_normal([10]))

# Our hypothesis: two ReLU hidden layers, output kept as logits
L1 = tf.nn.relu(tf.add(tf.matmul(x, W1), b1))
L2 = tf.nn.relu(tf.add(tf.matmul(L1, W2), b2))
hypothesis = tf.add(tf.matmul(L2, W3), b3)

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hypothesis, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(mnist.train.num_examples / batch_size)
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})
            avg_cost += sess.run(cost, feed_dict={x: batch_xs, y: batch_ys}) / total_batch
        if epoch % display_step == 0:
            print("Epoch:", "%04d" % (epoch + 1), "cost=", "{:.9f}".format(avg_cost))
    print("Optimization Finished")

    correct_prediction = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy :", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))
>>
Epoch: 0001 cost= 0.339885130
Epoch: 0002 cost= 0.110329526
Epoch: 0003 cost= 0.066120946
Epoch: 0004 cost= 0.045542726
Epoch: 0005 cost= 0.031215971
Epoch: 0006 cost= 0.021620217
Epoch: 0007 cost= 0.016331682
Epoch: 0008 cost= 0.011938562
Epoch: 0009 cost= 0.009188881
Epoch: 0010 cost= 0.007762998
Epoch: 0011 cost= 0.005782462
Epoch: 0012 cost= 0.005591247
Epoch: 0013 cost= 0.003899493
Epoch: 0014 cost= 0.003532865
Epoch: 0015 cost= 0.003765834
Optimization Finished
Accuracy : 0.9774
It achieves roughly 97 to 98% accuracy. Note that the first-epoch cost also drops from about 160 with plain random-normal initialization to about 0.34.
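As a side note, TF 1.x also ships a built-in Xavier (Glorot) initializer in tf.contrib.layers that can replace the hand-written xavier_init. A minimal sketch, assuming the contrib module is available in your TF 1.x build:

import tensorflow as tf

# Built-in Xavier initializer, equivalent in spirit to xavier_init above
xavier = tf.contrib.layers.xavier_initializer()

# Drop-in replacement when creating the weights (in a fresh graph)
W1 = tf.get_variable("W1", shape=[784, 256], initializer=xavier)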
Deep & Wide NN with Xavier initializer & dropout
Deep networks are prone to overfitting, so let's introduce dropout, which randomly excludes a subset of the network's units from each training step.
import tensorflow as tf

# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

def xavier_init(n_inputs, n_outputs, uniform=True):
    if uniform:
        init_range = tf.sqrt(6.0 / (n_inputs + n_outputs))
        return tf.random_uniform_initializer(-init_range, init_range)
    else:
        stddev = tf.sqrt(3.0 / (n_inputs + n_outputs))
        return tf.truncated_normal_initializer(stddev=stddev)

# Hyperparameters
learning_rate = 0.001
training_epochs = 15
batch_size = 100
display_step = 1

# Note: tf.nn.dropout takes the probability of KEEPING a unit,
# so dropout_rate here is the fraction of units kept, not dropped.
dropout_rate = tf.placeholder(tf.float32)

x = tf.placeholder(tf.float32, [None, 784])  # MNIST data image of shape 28 * 28 = 784
y = tf.placeholder(tf.float32, [None, 10])   # 0-9 digits recognition => 10 classes

W1 = tf.get_variable("W1", shape=[784, 512], initializer=xavier_init(784, 512))
W2 = tf.get_variable("W2", shape=[512, 256], initializer=xavier_init(512, 256))
W3 = tf.get_variable("W3", shape=[256, 128], initializer=xavier_init(256, 128))
W4 = tf.get_variable("W4", shape=[128, 64], initializer=xavier_init(128, 64))
W5 = tf.get_variable("W5", shape=[64, 10], initializer=xavier_init(64, 10))
b1 = tf.Variable(tf.random_normal([512]))
b2 = tf.Variable(tf.random_normal([256]))
b3 = tf.Variable(tf.random_normal([128]))
b4 = tf.Variable(tf.random_normal([64]))
b5 = tf.Variable(tf.random_normal([10]))

# Our hypothesis: four ReLU hidden layers with dropout, output kept as logits
_L1 = tf.nn.relu(tf.add(tf.matmul(x, W1), b1))
L1 = tf.nn.dropout(_L1, dropout_rate)
_L2 = tf.nn.relu(tf.add(tf.matmul(L1, W2), b2))
L2 = tf.nn.dropout(_L2, dropout_rate)
_L3 = tf.nn.relu(tf.add(tf.matmul(L2, W3), b3))
L3 = tf.nn.dropout(_L3, dropout_rate)
_L4 = tf.nn.relu(tf.add(tf.matmul(L3, W4), b4))
L4 = tf.nn.dropout(_L4, dropout_rate)
hypothesis = tf.add(tf.matmul(L4, W5), b5)

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hypothesis, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(mnist.train.num_examples / batch_size)
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys,
                                           dropout_rate: 0.7})
            avg_cost += sess.run(cost, feed_dict={x: batch_xs, y: batch_ys,
                                                  dropout_rate: 0.7}) / total_batch
        if epoch % display_step == 0:
            print("Epoch:", "%04d" % (epoch + 1), "cost=", "{:.9f}".format(avg_cost))
    print("Optimization Finished")

    correct_prediction = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy :", accuracy.eval({x: mnist.test.images,
                                       y: mnist.test.labels,
                                       dropout_rate: 1}))
>>
Epoch: 0001 cost= 0.575978510
Epoch: 0002 cost= 0.191286790
Epoch: 0003 cost= 0.134517717
Epoch: 0004 cost= 0.113371838
Epoch: 0005 cost= 0.091454296
Epoch: 0006 cost= 0.078287174
Epoch: 0007 cost= 0.070412253
Epoch: 0008 cost= 0.063096128
Epoch: 0009 cost= 0.055641586
Epoch: 0010 cost= 0.053606691
Epoch: 0011 cost= 0.046493807
Epoch: 0012 cost= 0.042869339
Epoch: 0013 cost= 0.038190212
Epoch: 0014 cost= 0.037902228
Epoch: 0015 cost= 0.035788863
Optimization Finished
Accuracy : 0.9823
Remarkably, the accuracy rises above 98%, a better result than without dropout.
A dropout_rate of about 0.7 (which, since tf.nn.dropout takes a keep probability, means keeping roughly 70% of the units) seems to work well, though this value still leaves room for experimentation.
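To make the train/test difference explicit, here is the dropout pattern in isolation. A minimal sketch; the placeholder is named keep_prob here, while the script above calls it dropout_rate:

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 784])
W = tf.get_variable("W_demo", shape=[784, 512],
                    initializer=tf.random_normal_initializer())
b = tf.Variable(tf.random_normal([512]))

# tf.nn.dropout takes the probability of KEEPING a unit, so 0.7 drops about 30%
keep_prob = tf.placeholder(tf.float32)
layer = tf.nn.dropout(tf.nn.relu(tf.matmul(x, W) + b), keep_prob)

# Training feeds keep_prob: 0.7; evaluation must feed keep_prob: 1.0
# so that nothing is dropped (and no rescaling happens) at test time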
Optimizer
TensorFlow also provides a wide range of optimizers for training:
Stochastic Gradient Descent (SGD)
Momentum
NAG (Nesterov Accelerated Gradient)
Adagrad
Adadelta
RMSProp
Adam
…
For this MNIST test, however, Adam is currently reported to perform best.
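Swapping optimizers in TF 1.x only changes the line that builds the training step. A minimal sketch over a toy cost; the momentum value 0.9 is just an illustrative choice:

import tensorflow as tf

# A toy cost so the snippet runs on its own; in the scripts above
# this would be the cross-entropy cost.
w = tf.Variable(5.0)
cost = tf.square(w)
learning_rate = 0.001

# Each of these is a drop-in replacement for the training step
train_sgd      = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
train_momentum = tf.train.MomentumOptimizer(learning_rate, momentum=0.9).minimize(cost)
train_nag      = tf.train.MomentumOptimizer(learning_rate, momentum=0.9,
                                            use_nesterov=True).minimize(cost)
train_adagrad  = tf.train.AdagradOptimizer(learning_rate).minimize(cost)
train_adadelta = tf.train.AdadeltaOptimizer(learning_rate).minimize(cost)
train_rmsprop  = tf.train.RMSPropOptimizer(learning_rate).minimize(cost)
train_adam     = tf.train.AdamOptimizer(learning_rate).minimize(cost)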
Result with 50 training epochs:
Epoch: 0001 cost= 0.690007022
Epoch: 0002 cost= 0.231413539
Epoch: 0003 cost= 0.166623234
Epoch: 0004 cost= 0.137392861
Epoch: 0005 cost= 0.109119156
Epoch: 0006 cost= 0.099077692
Epoch: 0007 cost= 0.087082866
Epoch: 0008 cost= 0.078429226
Epoch: 0009 cost= 0.070609569
Epoch: 0010 cost= 0.064453779
Epoch: 0011 cost= 0.061351138
Epoch: 0012 cost= 0.056578149
Epoch: 0013 cost= 0.051806200
Epoch: 0014 cost= 0.049847882
Epoch: 0015 cost= 0.045832138
Epoch: 0016 cost= 0.042040418
Epoch: 0017 cost= 0.043402059
Epoch: 0018 cost= 0.038806028
Epoch: 0019 cost= 0.043064067
Epoch: 0020 cost= 0.037600521
Epoch: 0021 cost= 0.037049865
Epoch: 0022 cost= 0.032759358
Epoch: 0023 cost= 0.035800081
Epoch: 0024 cost= 0.035921576
Epoch: 0025 cost= 0.029692400
Epoch: 0026 cost= 0.030480224
Epoch: 0027 cost= 0.029700849
Epoch: 0028 cost= 0.028042831
Epoch: 0029 cost= 0.028312489
Epoch: 0030 cost= 0.027937785
Epoch: 0031 cost= 0.028218212
Epoch: 0032 cost= 0.025205414
Epoch: 0033 cost= 0.025069857
Epoch: 0034 cost= 0.022753280
Epoch: 0035 cost= 0.026786121
Epoch: 0036 cost= 0.025479434
Epoch: 0037 cost= 0.022970497
Epoch: 0038 cost= 0.023919745
Epoch: 0039 cost= 0.021283284
Epoch: 0040 cost= 0.025076972
Epoch: 0041 cost= 0.021787082
Epoch: 0042 cost= 0.019106526
Epoch: 0043 cost= 0.023134691
Epoch: 0044 cost= 0.020448863
Epoch: 0045 cost= 0.021848984
Epoch: 0046 cost= 0.020627234
Epoch: 0047 cost= 0.017784834
Epoch: 0048 cost= 0.018144895
Epoch: 0049 cost= 0.020395084
Epoch: 0050 cost= 0.018059633
Optimization Finished
Accuracy : 0.9819
Random weight initialization and random mini-batch sampling are what make this stochastic optimization work, but they also mean the cost can occasionally rise, or barely fall, from one epoch to the next. So training for more epochs, at a greater cost in time and compute, is no guarantee of a better result.
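Because of this run-to-run randomness, it can help to fix the random seeds when comparing settings. A minimal sketch; the seed value 777 is arbitrary:

import numpy as np
import tensorflow as tf

# The graph-level seed covers random_normal/random_uniform initializers;
# the numpy seed covers the mini-batch shuffling done by next_batch.
# Results are reproducible only on the same machine and TF version.
np.random.seed(777)
tf.set_random_seed(777)

W1 = tf.Variable(tf.random_normal([784, 256]))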