More actions
No edit summary |
(Repair batch-0005 pages from live compare) |
||
| (5 intermediate revisions by 4 users not shown) | |||
| Line 1: | Line 1: | ||
[[머신러닝스터디/2016]] | |||
[[머신러닝스터디/2016/목차]] | |||
== 내용 == | == 내용 == | ||
* Embedding에는 word index가 필요함. | * Embedding에는 word index가 필요함. | ||
| Line 37: | Line 38: | ||
model.add(Dense(1, activation="sigmoid")) | model.add(Dense(1, activation="sigmoid")) | ||
model.compile(loss="binary_crossentropy", optimizer="adagrad", metrics= | model.compile(loss="binary_crossentropy", optimizer="adagrad", metrics=["accuracy"]) | ||
model.fit(X_train, y_train, batch_size=500, nb_epoch=100) | model.fit(X_train, y_train, batch_size=500, nb_epoch=100) | ||
| Line 43: | Line 44: | ||
pred = model.predict(X_test, batch_size=20000) | pred = model.predict(X_test, batch_size=20000) | ||
print (pred | print (pred[0], y_test[0]) | ||
print (pred | print (pred[1], y_test[1]) | ||
print (pred | print (pred[2], y_test[2]) | ||
=== Padding === | |||
pad_sequences는 배열의 길이가 서로 다를 때 특정 값을 채워 넣어 길이를 맞춘다. | |||
X_train = pad_sequences(X_train, 1000) | |||
위의 코드는 X_train의 입력 배열 중 길이가 1000보다 짧은 배열의 앞쪽에 0을 채워 넣어 길이를 1000으로 맞춘다. | |||
그러나 배열의 길이만 맞출 뿐, 배열에 들어 있는 단어 인덱스 값의 범위를 줄여 주지는 않는다. | |||
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=1000) | |||
위와 같이 nb_words=1000으로 단어 인덱스의 범위를 제한해야 Embedding의 입력 차원(1000)을 넘지 않는다. | |||
결과 예시 | |||
array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 1, 20, 28, 716, 48, 495, 79, 27, 493, 8, 2, | |||
7, 50, 5, 2, 2, 10, 5, 852, 157, 11, 5, 2, 2, | |||
10, 5, 500, 2, 6, 33, 256, 41, 2, 7, 17, 23, 48, | |||
2, 2, 26, 269, 929, 18, 2, 7, 2, 2, 8, 105, 5, | |||
2, 182, 314, 38, 98, 103, 7, 36, 2, 246, 360, 7, 19, | |||
396, 17, 26, 269, 929, 18, 2, 493, 6, 116, 7, 105, 5, | |||
575, 182, 27, 5, 2, 2, 130, 62, 17, 24, 89, 17, 13, | |||
381, 2, 8, 2, 7, 5, 2, 38, 325, 7, 17, 23, 93, | |||
9, 156, 252, 19, 235, 20, 28, 5, 104, 76, 7, 17, 169, | |||
35, 2, 17, 23, 2, 7, 36, 2, 934, 56, 2, 6, 17, | |||
891, 214, 11, 5, 2, 6, 92, 6, 33, 256, 82, 7], dtype=int32) | |||
nb_words로 단어 인덱스의 최댓값을 제한하지 않으면 Embedding 단계에서 인덱스 범위 초과(out of bounds) 에러가 난다. | |||
IndexError: index 4414 is out of bounds for size 1000 | |||
=== 학습 실패 === | |||
Using Theano backend. | |||
Epoch 1/10 | |||
22500/22500 [==============================] - 115s - loss: 0.6932 - acc: 0.5014 | |||
Epoch 2/10 | |||
22500/22500 [==============================] - 115s - loss: 0.6932 - acc: 0.5010 | |||
Epoch 3/10 | |||
22500/22500 [==============================] - 114s - loss: 0.6932 - acc: 0.5014 | |||
Epoch 4/10 | |||
22500/22500 [==============================] - 115s - loss: 0.6931 - acc: 0.5014 | |||
Epoch 5/10 | |||
22500/22500 [==============================] - 115s - loss: 0.6931 - acc: 0.5014 | |||
Epoch 6/10 | |||
22500/22500 [==============================] - 115s - loss: 0.6932 - acc: 0.5014 | |||
Epoch 7/10 | |||
22500/22500 [==============================] - 114s - loss: 0.6931 - acc: 0.5014 | |||
Epoch 8/10 | |||
22500/22500 [==============================] - 114s - loss: 0.6932 - acc: 0.5016 | |||
Epoch 9/10 | |||
22500/22500 [==============================] - 115s - loss: 0.6932 - acc: 0.5014 | |||
Epoch 10/10 | |||
22500/22500 [==============================] - 115s - loss: 0.6932 - acc: 0.5014 | |||
=== 학습 성공 === | |||
Epoch 1/10 | |||
22500/22500 [==============================] - 14282s - loss: 0.6927 - acc: 0.5164 | |||
Epoch 2/10 | |||
22500/22500 [==============================] - 10235s - loss: 0.6864 - acc: 0.5618 | |||
Epoch 3/10 | |||
22500/22500 [==============================] - 3236s - loss: 0.6541 - acc: 0.6508 | |||
Epoch 4/10 | |||
22500/22500 [==============================] - 3230s - loss: 0.5829 - acc: 0.7528 | |||
Epoch 5/10 | |||
22500/22500 [==============================] - 3222s - loss: 0.5490 - acc: 0.7745 | |||
Epoch 6/10 | |||
22500/22500 [==============================] - 3229s - loss: 0.5250 - acc: 0.7946 | |||
Epoch 7/10 | |||
22500/22500 [==============================] - 3230s - loss: 0.5052 - acc: 0.8030 | |||
Epoch 8/10 | |||
22300/22500 [============================>.] - ETA: 28s - loss: 0.4963 - acc: 0.8046 | |||
(사실 다음날 보니 프로세스가 죽어있어서 Epoch 8/10 이후의 결과는 없음... 학습 실패임) | |||
== 다음 시간에는 == | == 다음 시간에는 == | ||
* Coursera 동영상 week 7 보기 | * Coursera 동영상 week 7 보기 | ||
== 더 보기 == | == 더 보기 == | ||
Latest revision as of 00:44, 27 March 2026
내용
- Embedding에는 word index가 필요함.
- 초기에 Tokenizer로 word frequency를 input으로 썼는데 학습이 잘 안 됨.
- [1]
tokenizer = Tokenizer(nb_words=1000)
X_train = tokenizer.sequences_to_matrix(X_train, mode="freq")
- optimizer
- adamax 를 썼는데 accuracy가 50% 대에 머무름
- tensorflow는 adamax를 제공하지 않음. Keras에 자체 구현되어 있음(code).
- 적절한 batch size
- batch size가 너무 작으면(e.g. 32) 학습이 오래 걸린다.
- 반면 너무 크면 메모리를 많이 사용하게 된다.
코드
import keras
import numpy as np
from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=1000)
from keras.preprocessing.sequence import pad_sequences
X_train = pad_sequences(X_train, 1000)
X_test = pad_sequences(X_test, 1000)
model = Sequential()
model.add(Embedding(1000, 64, input_length=1000))
model.add(LSTM(output_dim=32, activation='sigmoid', inner_activation='hard_sigmoid'))
model.add(Dense(16, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(8, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adagrad", metrics=["accuracy"])
model.fit(X_train, y_train, batch_size=500, nb_epoch=100)
model.evaluate(X_test, y_test, batch_size=1000)
pred = model.predict(X_test, batch_size=20000)
print (pred[0], y_test[0])
print (pred[1], y_test[1])
print (pred[2], y_test[2])
Padding
pad_sequences는 배열의 길이가 서로 다를 때 특정 값을 채워 넣어 길이를 맞춘다.
X_train = pad_sequences(X_train, 1000)
위의 코드는 X_train의 입력 배열 중 길이가 1000보다 짧은 배열의 앞쪽에 0을 채워 넣어 길이를 1000으로 맞춘다. 그러나 배열의 길이만 맞출 뿐, 배열에 들어 있는 단어 인덱스 값의 범위를 줄여 주지는 않는다.
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=1000)
위와 같이 nb_words=1000으로 단어 인덱스의 범위를 제한해야 Embedding의 입력 차원(1000)을 넘지 않는다.
결과 예시
array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 20, 28, 716, 48, 495, 79, 27, 493, 8, 2,
7, 50, 5, 2, 2, 10, 5, 852, 157, 11, 5, 2, 2,
10, 5, 500, 2, 6, 33, 256, 41, 2, 7, 17, 23, 48,
2, 2, 26, 269, 929, 18, 2, 7, 2, 2, 8, 105, 5,
2, 182, 314, 38, 98, 103, 7, 36, 2, 246, 360, 7, 19,
396, 17, 26, 269, 929, 18, 2, 493, 6, 116, 7, 105, 5,
575, 182, 27, 5, 2, 2, 130, 62, 17, 24, 89, 17, 13,
381, 2, 8, 2, 7, 5, 2, 38, 325, 7, 17, 23, 93,
9, 156, 252, 19, 235, 20, 28, 5, 104, 76, 7, 17, 169,
35, 2, 17, 23, 2, 7, 36, 2, 934, 56, 2, 6, 17,
891, 214, 11, 5, 2, 6, 92, 6, 33, 256, 82, 7], dtype=int32)
nb_words로 단어 인덱스의 최댓값을 제한하지 않으면 Embedding 단계에서 인덱스 범위 초과(out of bounds) 에러가 난다.
IndexError: index 4414 is out of bounds for size 1000
학습 실패
Using Theano backend.
Epoch 1/10
22500/22500 [==============================] - 115s - loss: 0.6932 - acc: 0.5014
Epoch 2/10
22500/22500 [==============================] - 115s - loss: 0.6932 - acc: 0.5010
Epoch 3/10
22500/22500 [==============================] - 114s - loss: 0.6932 - acc: 0.5014
Epoch 4/10
22500/22500 [==============================] - 115s - loss: 0.6931 - acc: 0.5014
Epoch 5/10
22500/22500 [==============================] - 115s - loss: 0.6931 - acc: 0.5014
Epoch 6/10
22500/22500 [==============================] - 115s - loss: 0.6932 - acc: 0.5014
Epoch 7/10
22500/22500 [==============================] - 114s - loss: 0.6931 - acc: 0.5014
Epoch 8/10
22500/22500 [==============================] - 114s - loss: 0.6932 - acc: 0.5016
Epoch 9/10
22500/22500 [==============================] - 115s - loss: 0.6932 - acc: 0.5014
Epoch 10/10
22500/22500 [==============================] - 115s - loss: 0.6932 - acc: 0.5014
학습 성공
Epoch 1/10
22500/22500 [==============================] - 14282s - loss: 0.6927 - acc: 0.5164
Epoch 2/10
22500/22500 [==============================] - 10235s - loss: 0.6864 - acc: 0.5618
Epoch 3/10
22500/22500 [==============================] - 3236s - loss: 0.6541 - acc: 0.6508
Epoch 4/10
22500/22500 [==============================] - 3230s - loss: 0.5829 - acc: 0.7528
Epoch 5/10
22500/22500 [==============================] - 3222s - loss: 0.5490 - acc: 0.7745
Epoch 6/10
22500/22500 [==============================] - 3229s - loss: 0.5250 - acc: 0.7946
Epoch 7/10
22500/22500 [==============================] - 3230s - loss: 0.5052 - acc: 0.8030
Epoch 8/10
22300/22500 [============================>.] - ETA: 28s - loss: 0.4963 - acc: 0.8046
(사실 다음날 보니 프로세스가 죽어있어서 Epoch 8/10 이후의 결과는 없음... 학습 실패임)
다음 시간에는
- Coursera 동영상 week 7 보기