Studied Machine Learning Methodology & Analysis
I'm continuing to study the algorithms used in AI. As always, theory and practice are two different things. There's a saying I like and hear often: "Hardware doesn't lie." Here, "hardware" can also mean the computer itself.
My program doesn't work?? Hardware doesn't lie. It's simply my mistake!!!!
That aside, today's material breaks down into the following topics:
● Splitting the dataset (quick sketch right after this list)
● Underfitting / Overfitting
● Skewed Classes
● Precision/Recall
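The first topic, splitting the dataset, is easy to gloss over, so here is a minimal sketch of the usual train/validation/test split using scikit-learn's train_test_split. This is my own illustration, not lecture code; the 60/20/20 ratio and the toy data are assumptions.

```python
import numpy as np
from sklearn.model_selection import train_test_split

# Toy data: 100 samples, 3 features, 1 target.
X = np.random.rand(100, 3)
y = np.random.rand(100)

# First carve out 20% as the final test set...
X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# ...then split the remainder 75/25, giving a 60/20/20 train/val/test split overall.
X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=0.25, random_state=0)

print(len(X_train), len(X_val), len(X_test))  # 60 20 20
```

The first exercise below fits a plain linear regression on the Boston housing data and compares the final cost on the training set against the held-out test set.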
```python
import numpy as np
import tensorflow as tf
from tensorflow import keras

def get_data():
    # Load the Boston housing dataset; hold out 30% as the test set.
    boston_housing = keras.datasets.boston_housing
    (train_data, train_labels), (test_data, test_labels) = boston_housing.load_data(test_split=0.3)
    # Labels come back as 1-D arrays; add a column axis so shapes match y_output.
    return (train_data, np.expand_dims(train_labels, axis=1),
            test_data, np.expand_dims(test_labels, axis=1))

x_train, y_train, x_test, y_test = get_data()

num_inputs = x_train.shape[-1]
num_outputs = y_train.shape[-1]
print(num_inputs)
print(num_outputs)

# Linear regression: y = theta0 + x . thetan
x_input = tf.placeholder(tf.float32, [None, num_inputs])
y_input = tf.placeholder(tf.float32, [None, num_outputs])
theta0_var = tf.Variable(np.zeros([num_outputs], dtype=np.float32))
thetan_var = tf.Variable(np.zeros([num_inputs, num_outputs], dtype=np.float32))
y_output = theta0_var + tf.matmul(x_input, thetan_var)

# Mean squared error cost, minimized with Adam.
cost_output = tf.reduce_mean((y_input - y_output) ** 2)
learning_rate = 0.001
train_step = tf.train.AdamOptimizer(learning_rate).minimize(cost_output)

max_epoch = 10000
check_point = max_epoch // 5  # integer division so the modulo below stays exact

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(max_epoch):
        sess.run(train_step, feed_dict={x_input: x_train, y_input: y_train})
        if (i + 1) % check_point == 0:
            print('Done: {:.0f}%'.format((i + 1) / max_epoch * 100))
    # Compare the final cost on the training set vs. the held-out test set.
    train_cost = sess.run(cost_output, feed_dict={x_input: x_train, y_input: y_train})
    test_cost = sess.run(cost_output, feed_dict={x_input: x_test, y_input: y_test})
    print('train cost: {:.2f}, test cost: {:.2f}'.format(train_cost, test_cost))
```
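One thing I noticed while running this: the Boston features live on very different scales, so gradient descent converges slowly. Standardizing the inputs is the usual fix; this snippet is my own addition (not lecture code) and would be dropped in right after get_data() returns:

```python
# Standardize each feature using training-set statistics only, so nothing
# about the test set leaks into training. (My addition, not lecture code.)
mean = x_train.mean(axis=0)
std = x_train.std(axis=0)
x_train = (x_train - mean) / std
x_test = (x_test - mean) / std
```

The next exercise moves on to underfitting and overfitting with polynomial regression.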
```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

def true_fun(X):
    return np.cos(1.5 * np.pi * X)

def generate_data(size):
    # Noisy samples from the true function, sorted for nicer plotting.
    X = np.sort(np.random.rand(size))
    y = true_fun(X) + np.random.randn(size) * 0.1
    return X[:, np.newaxis], y

def build_model(degree):
    # Polynomial regression = polynomial feature expansion + linear regression.
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    linear_regression = LinearRegression()
    return Pipeline([("polynomial_features", polynomial_features),
                     ("linear_regression", linear_regression)])

def get_mse(model, X, y):
    y_pred = model.predict(X)
    return mean_squared_error(y, y_pred)

def plot_model(model, X_train, y_train):
    # Note: relies on the globals d and mses set in the loop below.
    X = np.linspace(0, 1, 100)
    plt.plot(X, model.predict(X[:, np.newaxis]), label="Model")
    plt.plot(X, true_fun(X), label="True function")
    plt.scatter(X_train, y_train, edgecolor='b', s=20, label="Samples")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.xlim((0, 1))
    plt.ylim((-2, 2))
    plt.legend(loc="best")
    plt.title('Degree {}\nMSE = {:.2e}'.format(d, mses['train'][-1]))

np.random.seed(0)
data_size = 70
X_train, y_train = generate_data(int(data_size * 0.7))
X_val, y_val = generate_data(int(data_size * 0.3))

max_degree = 35
degrees = range(1, max_degree + 1)
mses = {'train': [], 'val': []}
plot_degrees = [1, 4, max_degree]  # underfit, good fit, overfit

plt.close('all')
plt.figure(figsize=(14, 5))
for d in degrees:
    model = build_model(d)
    model.fit(X_train, y_train)
    mses['train'].append(get_mse(model, X_train, y_train))
    mses['val'].append(get_mse(model, X_val, y_val))
    if d in plot_degrees:
        ax = plt.subplot(1, len(plot_degrees), plot_degrees.index(d) + 1)
        plt.setp(ax, xticks=(), yticks=())
        plot_model(model, X_train, y_train)
plt.show()

# Train error keeps falling with complexity; validation error turns back up
# once the model starts overfitting.
plt.close('all')
plt.plot(degrees, mses['train'])
plt.plot(degrees, mses['val'])
plt.xlabel('model complexity (degree of polynomial)')
plt.ylabel('mean squared error')
plt.legend(['train', 'validation'], loc='upper center')
plt.show()
```
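The loop above sweeps the polynomial degree and records train/validation MSE by hand. scikit-learn can run the same sweep with validation_curve; here is a hedged sketch reusing build_model and generate_data from above, swapping the fixed validation set for 5-fold cross-validation (my own variation, not lecture code):

```python
from sklearn.model_selection import validation_curve

# Sweep the polynomial degree with 5-fold CV instead of a fixed validation split.
X_all, y_all = generate_data(data_size)
train_scores, val_scores = validation_curve(
    build_model(1), X_all, y_all,
    param_name="polynomial_features__degree",
    param_range=list(degrees),
    scoring="neg_mean_squared_error", cv=5)

# Scores are negated MSE averaged over folds; flip the sign back.
train_mse = -train_scores.mean(axis=1)
val_mse = -val_scores.mean(axis=1)
print('best degree by CV:', list(degrees)[np.argmin(val_mse)])
```

The last exercise turns to skewed classes and precision/recall.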
```python
import numpy as np
import pandas as pd
import urllib.request
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

def lr_train_and_predict(X, y):
    m = LogisticRegression().fit(X, y)
    return m, m.predict(X)

def draw_precision_recall_graph(precision, recall):
    plt.close('all')
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    step_kwargs = {'step': 'post'}
    plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.show()

# Download the dataset
url = 'https://raw.githubusercontent.com/salopge/datasets/master/skewed_data.csv'
urllib.request.urlretrieve(url, './skewed_data.csv')

# The label splits each sample into one of two states: balanced / tilted
data = pd.read_csv('skewed_data.csv', names=['var1', 'var2', 'var3', 'label'])

# Train on the data and predict
X_org = data.loc[:, 'var1':'var3']
y_org = data.label
model_org, pred_y_org = lr_train_and_predict(X_org, y_org)

# Predicted probability of the positive class, used to sweep thresholds.
y_prob_org = model_org.predict_proba(X_org)[:, 1]
print(y_prob_org[:5])

precision, recall, threshold = precision_recall_curve(y_org, y_prob_org)
print('pre', precision[:5])
print('reca', recall[:5])
print(threshold[:5])

draw_precision_recall_graph(precision, recall)

# precision/recall have one more entry than threshold (the curve's final point
# is precision=1, recall=0), so drop the last entry before matching thresholds.
f1_score = 2 * precision[:-1] * recall[:-1] / (precision[:-1] + recall[:-1])
print(f1_score[:5])

# Pick the threshold that maximizes F1.
best_threshold = threshold[np.argmax(f1_score)]
print(best_threshold)
```
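Since the whole point of the skewed-classes discussion is that accuracy can be misleading, it is worth checking that explicitly (accuracy_score is already imported above but unused). This comparison is my own addition, assuming the labels in skewed_data.csv are 0/1 with the positive class in the minority:

```python
from sklearn.metrics import precision_score, recall_score

# A dummy that always predicts the majority class: with skewed labels its
# accuracy is already high, which is exactly why accuracy alone misleads.
majority = y_org.mode()[0]
dummy_pred = np.full_like(pred_y_org, majority)
print('model accuracy   :', accuracy_score(y_org, pred_y_org))
print('majority accuracy:', accuracy_score(y_org, dummy_pred))

# Precision/recall at the F1-optimal threshold found above.
pred_at_best = (y_prob_org >= best_threshold).astype(int)
print('precision:', precision_score(y_org, pred_at_best))
print('recall   :', recall_score(y_org, pred_at_best))
```

If the dummy's accuracy comes out nearly as high as the model's, that is the skewed-classes problem in a nutshell: precision and recall, not accuracy, tell you whether the rare class is actually being caught.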