lucasw
大佬教麻
你可以叫我 000 / Lucas
建國中學資訊社37th學術長
建國中學電子計算機研習社44th學術
校際交流群創群者
不會音遊不會競程不會數學的笨
資訊技能樹亂點但都一樣爛
專案爛尾大師
IZCC x SCINT x Ruby Taiwan 聯課負責人
建國中學電子計算機研習社44th學術+總務
是的,我是總務。在座的你各位下次記得交社費束脩給我
技能樹貧乏
想拿機器學習做專題結果只學會使用API
上屆社展烙跑到資訊社的叛徒
科班墊神
Input Layer
Hidden Layer
Output Layer
pip install numpy
pip install matplotlib
pip install keras
pip install tensorflow
numpy:提供矩陣功能
matplotlib:繪製圖表使用
keras&tensorflow:提供MNIST資料庫
MNIST數據集:一堆手寫數字圖片的數據集
常被用於人工智慧測試辨識數字使用
# act.py
import numpy as np
def relu(x):
    """Rectified linear unit, applied element-wise: x where x > 0, else 0."""
    return np.where(x > 0, x, 0)
def drelu(x):
    """Derivative of ReLU, element-wise: 1 where x > 0, else 0."""
    return np.greater(x, 0).astype(int)
np.maximum(a, x)
相當於對陣列內每個元素取 max(a, x)
np.where(condition, a, b)
會將x的每一項使用參數中的判斷式做判斷
根據 true or false 會將輸出陣列的那一項
設定為a or b
第一步先來初始化一個神經網路的物件
class NeuralNetwork:
def __init__(self, layers: list[int], activation_function: Callable, dactivation_function: Callable=None, learning_rate: float=1e-3) -> None:
self.layers = layers
self.learning_rate = learning_rate
self.act = activation_function
self.dact = dactivation_function or self.d(activation_function)
self.delta = 1e-10
self.Z: list[np.ndarray] = [np.zeros(layers[0])]
self.W: list[np.ndarray] = [np.zeros(layers[0])]
self.B: list[np.ndarray] = [np.zeros(layers[0])]
self.output: list[np.ndarray] = [np.zeros(layers[0])]
for i in range(1, len(self.layers)):
self.W.append(np.random.randn(self.layers[i], self.layers[i-1]) * np.sqrt(2/layers[i-1]))
self.B.append(np.zeros(self.layers[i]))
self.Z.append(np.zeros(self.layers[i]))
self.output.append(np.zeros(self.layers[i]))
def d(self, f: Callable) -> Callable:
delta = 1e-10j
def df(x): return f(x + delta).imag / delta.imag
return df
第一步先來初始化一個神經網路的物件
class NeuralNetwork:
def __init__(self, layers: list[int], activation_function: Callable, dactivation_function: Callable=None, learning_rate: float=1e-3) -> None:
self.layers = layers
self.learning_rate = learning_rate
self.act = activation_function
self.dact = dactivation_function or self.d(activation_function)
self.delta = 1e-10
self.Z: list[np.ndarray] = [np.zeros(layers[0])]
self.W: list[np.ndarray] = [np.zeros(layers[0])]
self.B: list[np.ndarray] = [np.zeros(layers[0])]
self.output: list[np.ndarray] = [np.zeros(layers[0])]
for i in range(1, len(self.layers)):
self.W.append(np.random.randn(self.layers[i], self.layers[i-1]) * np.sqrt(2/layers[i-1]))
self.B.append(np.zeros(self.layers[i]))
self.Z.append(np.zeros(self.layers[i]))
self.output.append(np.zeros(self.layers[i]))
def d(self, f: Callable) -> Callable:
delta = 1e-10j
def df(x): return f(x + delta).imag / delta.imag
return df
我們希望能夠透過輸入來決定神經網路的結構
e.g. layers = [16, 32, 64, 32, 10]
那麼輸入層 = 16 輸出層 = 10 其他層同理
class NeuralNetwork:
def __init__(self, layers: list[int], activation_function: Callable, dactivation_function: Callable=None, learning_rate: float=1e-3) -> None:
self.layers = layers
self.learning_rate = learning_rate
self.act = activation_function
self.dact = dactivation_function or self.d(activation_function)
self.delta = 1e-10
self.Z: list[np.ndarray] = [np.zeros(layers[0])]
self.W: list[np.ndarray] = [np.zeros(layers[0])]
self.B: list[np.ndarray] = [np.zeros(layers[0])]
self.output: list[np.ndarray] = [np.zeros(layers[0])]
for i in range(1, len(self.layers)):
self.W.append(np.random.randn(self.layers[i], self.layers[i-1]) * np.sqrt(2/layers[i-1]))
self.B.append(np.zeros(self.layers[i]))
self.Z.append(np.zeros(self.layers[i]))
self.output.append(np.zeros(self.layers[i]))
def d(self, f: Callable) -> Callable:
delta = 1e-10j
def df(x): return f(x + delta).imag / delta.imag
return df
設定學習率 梯度下降會使用到
class NeuralNetwork:
def __init__(self, layers: list[int], activation_function: Callable, dactivation_function: Callable=None, learning_rate: float=1e-3) -> None:
self.layers = layers
self.learning_rate = learning_rate
self.act = activation_function
self.dact = dactivation_function or self.d(activation_function)
self.delta = 1e-10
self.Z: list[np.ndarray] = [np.zeros(layers[0])]
self.W: list[np.ndarray] = [np.zeros(layers[0])]
self.B: list[np.ndarray] = [np.zeros(layers[0])]
self.output: list[np.ndarray] = [np.zeros(layers[0])]
for i in range(1, len(self.layers)):
self.W.append(np.random.randn(self.layers[i], self.layers[i-1]) * np.sqrt(2/layers[i-1]))
self.B.append(np.zeros(self.layers[i]))
self.Z.append(np.zeros(self.layers[i]))
self.output.append(np.zeros(self.layers[i]))
def d(self, f: Callable) -> Callable:
delta = 1e-10j
def df(x): return f(x + delta).imag / delta.imag
return df
設定 activation functions
另外構建了一個函數在未給予dact的情況下自動微分
class NeuralNetwork:
def __init__(self, layers: list[int], activation_function: Callable, dactivation_function: Callable=None, learning_rate: float=1e-3) -> None:
self.layers = layers
self.learning_rate = learning_rate
self.act = activation_function
self.dact = dactivation_function or self.d(activation_function)
self.delta = 1e-10
self.Z: list[np.ndarray] = [np.zeros(layers[0])]
self.W: list[np.ndarray] = [np.zeros(layers[0])]
self.B: list[np.ndarray] = [np.zeros(layers[0])]
self.output: list[np.ndarray] = [np.zeros(layers[0])]
for i in range(1, len(self.layers)):
self.W.append(np.random.randn(self.layers[i], self.layers[i-1]) * np.sqrt(2/layers[i-1]))
self.B.append(np.zeros(self.layers[i]))
self.Z.append(np.zeros(self.layers[i]))
self.output.append(np.zeros(self.layers[i]))
def d(self, f: Callable) -> Callable:
delta = 1e-10j
def df(x): return f(x + delta).imag / delta.imag
return df
設定一個很小的數字
用以防止特定地方可能除以0導致程式發生錯誤
class NeuralNetwork:
def __init__(self, layers: list[int], activation_function: Callable, dactivation_function: Callable=None, learning_rate: float=1e-3) -> None:
self.layers = layers
self.learning_rate = learning_rate
self.act = activation_function
self.dact = dactivation_function or self.d(activation_function)
self.delta = 1e-10
self.Z: list[np.ndarray] = [np.zeros(layers[0])]
self.W: list[np.ndarray] = [np.zeros(layers[0])]
self.B: list[np.ndarray] = [np.zeros(layers[0])]
self.output: list[np.ndarray] = [np.zeros(layers[0])]
for i in range(1, len(self.layers)):
self.W.append(np.random.randn(self.layers[i], self.layers[i-1]) * np.sqrt(2/layers[i-1]))
self.B.append(np.zeros(self.layers[i]))
self.Z.append(np.zeros(self.layers[i]))
self.output.append(np.zeros(self.layers[i]))
def d(self, f: Callable) -> Callable:
delta = 1e-10j
def df(x): return f(x + delta).imag / delta.imag
return df
建立存放參數的陣列
並且事先以填充0初始化每個神經元的參數陣列大小
class NeuralNetwork:
def __init__(self, layers: list[int], activation_function: Callable, dactivation_function: Callable=None, learning_rate: float=1e-3) -> None:
self.layers = layers
self.learning_rate = learning_rate
self.act = activation_function
self.dact = dactivation_function or self.d(activation_function)
self.delta = 1e-10
self.Z: list[np.ndarray] = [np.zeros(layers[0])]
self.W: list[np.ndarray] = [np.zeros(layers[0])]
self.B: list[np.ndarray] = [np.zeros(layers[0])]
self.output: list[np.ndarray] = [np.zeros(layers[0])]
for i in range(1, len(self.layers)):
self.W.append(np.random.randn(self.layers[i], self.layers[i-1]) * np.sqrt(2/layers[i-1]))
self.B.append(np.zeros(self.layers[i]))
self.Z.append(np.zeros(self.layers[i]))
self.output.append(np.zeros(self.layers[i]))
def d(self, f: Callable) -> Callable:
delta = 1e-10j
def df(x): return f(x + delta).imag / delta.imag
return df
有人可能會好奇這行的初始化方式
關於參數的初始化方式詳細會在
Problems 章節進行說明
def softmax(self, x):
    """Numerically stable softmax: shift by the max before exponentiating."""
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    return exponentials / exponentials.sum()
def cross_entropy(self, y: np.ndarray) -> np.float64:
    """Cross-entropy between one-hot target y and the last forward output.

    self.delta guards against log(0) when the predicted probability is 0.
    """
    predicted = self.output[-1]
    log_probs = np.log(predicted + self.delta)
    return -np.dot(y.T, log_probs)
在物件內建立好待會會使用到的算式
另外前面提到的delta在cross entropy時便使用到了
np.dot(x, y)
內積 詳細可見線性代數章節
np.exp(x)
對每個x的元素取 e^x
def save_params(self, filename: str="params.json"):
with open(filename, "w") as f:
json.dump({"W": self.W, "B": self.B}, f, indent=4, cls=NumpyArrayEncoder)
def load_params(self, filename: str="params.json"):
with open(filename, "r") as f:
params = json.load(f)
self.W = []
self.B = []
for w in params["W"]: self.W.append(np.asarray(w))
for b in params["B"]: self.B.append(np.asarray(b))
設定個儲存參數的東西
可以避免每次都要重複花時間去訓練
def save_params(self, filename: str="params.json"):
with open(filename, "w") as f:
json.dump({"W": self.W, "B": self.B}, f, indent=4, cls=NumpyArrayEncoder)
def load_params(self, filename: str="params.json"):
with open(filename, "r") as f:
params = json.load(f)
self.W = []
self.B = []
for w in params["W"]: self.W.append(np.asarray(w))
for b in params["B"]: self.B.append(np.asarray(b))
設定個儲存參數的東西
可以避免每次都要重複花時間去訓練
另外由於 numpy array 無法被 json 序列化
因此需要使用自訂的 encoder
# encoder.py
import json
import numpy as np
class NumpyArrayEncoder(json.JSONEncoder):
    """JSON encoder that serializes numpy arrays as plain Python lists."""

    def default(self, obj):
        # Arrays become lists; everything else is deferred to the base
        # class, which raises TypeError for unserializable objects.
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
另外由於 numpy array 無法被 json 序列化
因此需要使用自訂的 encoder
自訂 encoder 會將物件內的每個 numpy array
進行 tolist() 操作
讓他可以變成 python 內建的 list 被儲存到 json
def forward(self, x: np.ndarray) -> np.ndarray:
    """Propagate input x through every layer and return the final output.

    Stores the pre-activations in self.Z and post-activations in
    self.output so backward() can reuse them.
    """
    # Input length must match the declared input-layer size.
    assert x.shape[0] == self.layers[0]
    self.output[0] = x
    for i in range(1, len(self.layers)):
        # Affine step: z = W x + b for layer i.
        self.Z[i] = np.dot(self.W[i], self.output[i-1]) + self.B[i]
        # Softmax on the output layer, hidden activation elsewhere.
        if i == len(self.layers)-1: self.output[i] = self.softmax(self.Z[i])
        else: self.output[i] = self.act(self.Z[i])
    return self.output[-1]
前向傳播 啟動
def forward(self, x: np.ndarray) -> np.ndarray:
assert x.shape[0] == self.layers[0]
self.output[0] = x
for i in range(1, len(self.layers)):
self.Z[i] = np.dot(self.W[i], self.output[i-1]) + self.B[i]
if i == len(self.layers)-1: self.output[i] = self.softmax(self.Z[i])
else: self.output[i] = self.act(self.Z[i])
return self.output[-1]
def forward(self, x: np.ndarray) -> np.ndarray:
assert x.shape[0] == self.layers[0]
self.output[0] = x
for i in range(1, len(self.layers)):
self.Z[i] = np.dot(self.W[i], self.output[i-1]) + self.B[i]
if i == len(self.layers)-1: self.output[i] = self.softmax(self.Z[i])
else: self.output[i] = self.act(self.Z[i])
return self.output[-1]
def backward(self, y: np.ndarray) -> None:
    """Backpropagate the error for one-hot target y and update W/B in place.

    Uses plain gradient descent with self.learning_rate; relies on the
    Z/output caches left by the most recent forward() call.
    """
    # (output - y) is the softmax + cross-entropy output gradient.
    x = self.output[-1] - y
    for i in range(len(self.layers)-1, 0, -1):
        # NOTE(review): dact is applied at the softmax layer too, even
        # though (output - y) is already the exact softmax+CE gradient —
        # confirm this extra factor is intended.
        t = x * self.dact(self.Z[i])
        # Gradient propagated to the previous layer (computed before the
        # weight update so the pre-update W is used).
        x = np.dot(self.W[i].T, t)
        self.W[i] -= self.learning_rate * np.outer(t, self.output[i-1])
        self.B[i] -= self.learning_rate * t
反向傳播 啟動
def backward(self, y: np.ndarray) -> None:
x = self.output[-1] - y
for i in range(len(self.layers)-1, 0, -1):
t = x * self.dact(self.Z[i])
x = np.dot(self.W[i].T, t)
self.W[i] -= self.learning_rate * np.outer(t, self.output[i-1])
self.B[i] -= self.learning_rate * t
def backward(self, y: np.ndarray) -> None:
x = self.output[-1] - y
for i in range(len(self.layers)-1, 0, -1):
t = x * self.dact(self.Z[i])
x = np.dot(self.W[i].T, t)
self.W[i] -= self.learning_rate * np.outer(t, self.output[i-1])
self.B[i] -= self.learning_rate * t
def backward(self, y: np.ndarray) -> None:
x = self.output[-1] - y
for i in range(len(self.layers)-1, 0, -1):
t = x * self.dact(self.Z[i])
x = np.dot(self.W[i].T, t)
self.W[i] -= self.learning_rate * np.outer(t, self.output[i-1])
self.B[i] -= self.learning_rate * t
def backward(self, y: np.ndarray) -> None:
x = self.output[-1] - y
for i in range(len(self.layers)-1, 0, -1):
t = x * self.dact(self.Z[i])
x = np.dot(self.W[i].T, t)
self.W[i] -= self.learning_rate * np.outer(t, self.output[i-1])
self.B[i] -= self.learning_rate * t
def backward(self, y: np.ndarray) -> None:
x = self.output[-1] - y
for i in range(len(self.layers)-1, 0, -1):
t = x * self.dact(self.Z[i])
x = np.dot(self.W[i].T, t)
self.W[i] -= self.learning_rate * np.outer(t, self.output[i-1])
self.B[i] -= self.learning_rate * t
將反向傳播得到的資訊
乘上權重
def backward(self, y: np.ndarray) -> None:
x = self.output[-1] - y
for i in range(len(self.layers)-1, 0, -1):
t = x * self.dact(self.Z[i])
x = np.dot(self.W[i].T, t)
self.W[i] -= self.learning_rate * np.outer(t, self.output[i-1])
self.B[i] -= self.learning_rate * t
def backward(self, y: np.ndarray) -> None:
x = self.output[-1] - y
for i in range(len(self.layers)-1, 0, -1):
t = x * self.dact(self.Z[i])
x = np.dot(self.W[i].T, t)
self.W[i] -= self.learning_rate * np.outer(t, self.output[i-1])
self.B[i] -= self.learning_rate * t
因此 W 的梯度要多乘上前一層的輸出值(即經過激活函數、但未經微分的結果)
def fit(self, x: np.ndarray, y: np.ndarray) -> np.float64:
    """One training step on a single sample: forward, loss, backward.

    Returns the cross-entropy loss measured before the parameter update.
    """
    self.forward(x)
    loss = self.cross_entropy(y)
    self.backward(y)
    return loss
每次進行一次最佳化就
過一次正向傳播計算loss後
進行反向傳播更新參數
# act.py
import numpy as np
def relu(x):
return np.maximum(0, x)
def drelu(x):
return np.where(x > 0, 1, 0)
# *)#$#*)%*)#$#)*%
# encoder.py
import json
import numpy as np
class NumpyArrayEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, np.ndarray):
return obj.tolist()
return json.JSONEncoder.default(self, obj)
# *)#$#*)%*)#$#)*%
# nn.py
import json
from datetime import datetime
from typing import Callable
import numpy as np
from np_encoder import NumpyArrayEncoder
class NeuralNetwork:
def __init__(self, layers: list[int], activation_function: Callable, dactivation_function: Callable=None, learning_rate: float=1e-3) -> None:
self.layers = layers
self.learning_rate = learning_rate
self.act = activation_function
self.dact = dactivation_function or self.d(activation_function)
self.delta = 1e-10
self.Z: list[np.ndarray] = [np.zeros(layers[0])]
self.W: list[np.ndarray] = [np.zeros(layers[0])]
self.B: list[np.ndarray] = [np.zeros(layers[0])]
self.output: list[np.ndarray] = [np.zeros(layers[0])]
for i in range(1, len(self.layers)):
self.W.append(np.random.randn(self.layers[i], self.layers[i-1]) * np.sqrt(2/layers[i-1]))
self.B.append(np.zeros(self.layers[i]))
self.Z.append(np.zeros(self.layers[i]))
self.output.append(np.zeros(self.layers[i]))
def d(self, f: Callable) -> Callable:
delta = 1e-10j
def df(x): return f(x + delta).imag / delta.imag
return df
def softmax(self, x):
exp_x = np.exp(x - np.max(x))
return exp_x / np.sum(exp_x)
def cross_entropy(self, y: np.ndarray) -> np.float64:
return -np.dot(y.T, np.log(self.output[-1] + self.delta))
def forward(self, x: np.ndarray) -> np.ndarray:
assert x.shape[0] == self.layers[0]
self.output[0] = x
for i in range(1, len(self.layers)):
self.Z[i] = np.dot(self.W[i], self.output[i-1]) + self.B[i]
if i == len(self.layers)-1: self.output[i] = self.softmax(self.Z[i])
else: self.output[i] = self.act(self.Z[i])
return self.output[-1]
def backward(self, y: np.ndarray) -> np.ndarray:
x = self.output[-1] - y
for i in reversed(range(1, len(self.layers))):
t = x * self.dact(self.Z[i])
x = np.dot(self.W[i].T, t)
self.W[i] -= self.learning_rate * np.outer(t, self.output[i-1])
self.B[i] -= self.learning_rate * t
return x
def fit(self, x: np.ndarray, y: np.ndarray) -> np.float64:
self.forward(x)
loss = self.cross_entropy(y)
self.backward(y)
return loss
def save_params(self, filename: str="params.json"):
with open(filename, "w") as f:
json.dump({"W": self.W, "B": self.B}, f, indent=4, cls=NumpyArrayEncoder)
def load_params(self, filename: str="params.json"):
with open(filename, "r") as f:
params = json.load(f)
self.W = []
self.B = []
for w in params["W"]: self.W.append(np.asarray(w))
for b in params["B"]: self.B.append(np.asarray(b))
# *)#$#*)%*)#$#)*%
我們期望運行神經網路的流程如下
訓練神經網路 => 測試模型準確率
也就是說我們的啟動檔案
會匯入模型以及MNIST
並且進行訓練與測試
最後會得到準確率的結果與過程的loss變化
建立一個 main.py
此時檔案結構應如下
接著來編寫 main.py
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import mnist
from nn import NeuralNetwork
from act import relu, drelu
先引入所需函式庫
設定參數
learning_rate: 學習率
data_size: 資料大小(等同輸入層的層數)
batch_size: 每次訓練並輸出的批次量
max_trains: 最大訓練次數
epochs: 總共訓練幾個epoch
save: 是否要儲存參數
learning_rate = 1e-3
data_size = 784
batch_size = 64
max_trains = 60000
epochs = 3
save = True
(x_train_image, y_train_label), (x_test_image, y_test_label) = mnist.load_data()
x_trains = np.array(x_train_image).reshape(len(x_train_image), 784).astype("float64")/255
x_tests = np.array(x_test_image).reshape(len(x_test_image), 784).astype("float64")/255
y_trains = np.eye(10)[y_train_label]
y_tests = np.eye(10)[y_test_label]
從MNIST中載入資料
並且將圖片轉換成一維陣列後設定型別
除以255是為了讓灰階區間從[0, 255]->[0, 1]
(x_train_image, y_train_label), (x_test_image, y_test_label) = mnist.load_data()
x_trains = np.array(x_train_image).reshape(len(x_train_image), 784).astype("float64")/255
x_tests = np.array(x_test_image).reshape(len(x_test_image), 784).astype("float64")/255
y_trains = np.eye(10)[y_train_label]
y_tests = np.eye(10)[y_test_label]
np.eye(n)會建立一個n*n的單位矩陣
(x_train_image, y_train_label), (x_test_image, y_test_label) = mnist.load_data()
x_trains = np.array(x_train_image).reshape(len(x_train_image), 784).astype("float64")/255
x_tests = np.array(x_test_image).reshape(len(x_test_image), 784).astype("float64")/255
y_trains = np.eye(10)[y_train_label]
y_tests = np.eye(10)[y_test_label]
np.eye(n)會建立一個n*n的單位矩陣
(x_train_image, y_train_label), (x_test_image, y_test_label) = mnist.load_data()
x_trains = np.array(x_train_image).reshape(len(x_train_image), 784).astype("float64")/255
x_tests = np.array(x_test_image).reshape(len(x_test_image), 784).astype("float64")/255
y_trains = np.eye(10)[y_train_label]
y_tests = np.eye(10)[y_test_label]
那為什麼要這麼做呢
因為我們要讓label的形式轉換一下
(x_train_image, y_train_label), (x_test_image, y_test_label) = mnist.load_data()
x_trains = np.array(x_train_image).reshape(len(x_train_image), 784).astype("float64")/255
x_tests = np.array(x_test_image).reshape(len(x_test_image), 784).astype("float64")/255
y_trains = np.eye(10)[y_train_label]
y_tests = np.eye(10)[y_test_label]
那為什麼要這麼做呢
因為我們要讓label的形式轉換一下
※它給的label是0-based 不用減一
nn = NeuralNetwork(layers=[784, 256, 128, 64, 10], activation_function=relu, dactivation_function=drelu, learning_rate=learning_rate)
train_loss = nn.train(x_trains, y_trains, epochs, batch_size, max_trains, save)
test_loss = nn.predict(x_tests, y_tests)
創建神經網路物件
並依序從訓練及預測函式拿到loss
(函式內容會在後面章節提到)
plt.plot(train_loss, label="Train Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()
使用matplotlib函式庫繪製loss的變化圖表
# *)#$#*)%*)#$#)*%
# main.py
# Entry point: train the from-scratch network on MNIST, evaluate it on
# the test split, and plot the recorded training loss.
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import mnist
from nn import NeuralNetwork
from act import relu, drelu
# Hyperparameters.
learning_rate = 1e-3
data_size = 784   # flattened 28x28 grayscale image
batch_size = 64
max_trains = 60000   # cap on samples used per epoch
epochs = 3
save = True   # persist trained parameters to params.json
# Load MNIST; flatten each image to 784 floats scaled from [0, 255] to [0, 1].
(x_train_image, y_train_label), (x_test_image, y_test_label) = mnist.load_data()
x_trains = np.array(x_train_image).reshape(len(x_train_image), 784).astype("float64")/255
x_tests = np.array(x_test_image).reshape(len(x_test_image), 784).astype("float64")/255
# One-hot encode the labels via rows of a 10x10 identity matrix.
y_trains = np.eye(10)[y_train_label]
y_tests = np.eye(10)[y_test_label]
nn = NeuralNetwork(layers=[784, 256, 128, 64, 10], activation_function=relu, dactivation_function=drelu, learning_rate=learning_rate)
train_loss = nn.train(x_trains, y_trains, epochs, batch_size, max_trains, save)
test_loss = nn.predict(x_tests, y_tests)
# NOTE(review): train_loss is recorded per batch, so the x-axis is the
# batch index even though it is labeled "Epoch" — confirm intended.
plt.plot(train_loss, label="Train Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()
接著來處理訓練(train)跟預測(predict)的函式
為了讓每次訓練的輸出
不要因為單次的偏差而顯得數據差過大
我們通常會以一個批次(batch)為單位去做訓練
而批次的大小
另外批次還有其他作用會在後續章節提到
而訓練也可以藉由重複訓練來加強參數
每個epoch便是將整個數據集訓練一遍
而epoch的次數越高 訓練時長越高
但相對的準確率就會提高
另外
最大訓練次數純粹是因為我懶得訓練太久而強制中止的東東
在不包含測試資料的資料集中也可以拿來保留資料做測試
def train(self, x_trains: np.ndarray, y_trains: np.ndarray, epochs: int, batch_size: int=64, max_trains: int=60000, save: bool=False) -> list[np.float64]:
    """Train the network sample-by-sample, reporting loss per mini-batch.

    Args:
        x_trains: training inputs, one flattened sample per row.
        y_trains: one-hot labels aligned with x_trains.
        epochs: number of full passes over the (capped) dataset.
        batch_size: number of samples averaged into one reported loss.
        max_trains: hard cap on samples used per epoch.
        save: if True, persist W/B via save_params() after every epoch.

    Returns:
        The per-batch average losses accumulated across all epochs.
    """
    train_loss = []
    # Clamp once up front: the cap can never exceed the dataset size.
    max_trains = min(max_trains, len(x_trains))
    # Batches per epoch via ceiling division, so an exact multiple of
    # batch_size no longer counts a phantom extra batch.
    batchs = -(-max_trains // batch_size)
    for epoch in range(epochs):
        total_loss = 0
        start_time = datetime.now()
        for i in range(0, max_trains, batch_size):
            x_batch = x_trains[i:i + batch_size]
            y_batch = y_trains[i:i + batch_size]
            batch_loss = 0
            # fit() runs forward + backward per sample; the batch only
            # controls reporting granularity here.
            for x_train, y_train in zip(x_batch, y_batch):
                batch_loss += self.fit(x_train, y_train)
            # Divide by the actual batch length so a short final batch
            # is not under-reported.
            loss = batch_loss / len(x_batch)
            total_loss += loss
            train_loss.append(loss)
            batch = i // batch_size + 1
            print(
                "Batch {space}{batch}/{batchs}, Loss: {loss}, Average Loss: {avg_loss}"
                .format(
                    space=" " * (len(str(batchs)) - len(str(batch))),
                    batch=batch,
                    batchs=batchs,
                    loss='%.5f' % loss,
                    avg_loss='%.5f' % (total_loss / batch)
                )
            )
        print(
            "Epoch {space}{epoch}/{epochs}, Loss: {loss}, Save: {save}, Time: {time}"
            .format(
                space=" " * (len(str(epochs)) + len(str(batchs)) * 2 - len(str(epoch)) * 3),
                epoch=epoch + 1,
                epochs=epochs,
                loss='%.5f' % (total_loss / batchs),
                save=save,
                time=datetime.now() - start_time
            )
        )
        if save:
            self.save_params()
    return train_loss
幾乎同訓練
不過不會更新參數
並且加上準確率
def predict(self, x_tests: np.ndarray, y_tests: np.ndarray) -> list[np.float64]:
    """Evaluate the network on a labeled test set without updating parameters.

    Prints per-sample loss/correctness plus the final averages and
    returns the list of per-sample losses.
    """
    test_loss = []
    # Running count of correct predictions; divided by the set size at the end.
    accuracy = 0
    for i, (x_test, y_test) in enumerate(zip(x_tests, y_tests)):
        output = self.forward(x_test)
        # cross_entropy reads the output cached by forward() above.
        loss = self.cross_entropy(y_test)
        # Predicted class vs. the one-hot target's class.
        correct = output.argmax() == y_test.argmax()
        if correct:
            accuracy += 1
        test_loss.append(loss)
        print(
            "Test {space}{test}/{tests}, Loss: {loss}, Correct: {correct}"
            .format(
                space=" " * (len(str(len(x_tests))) - len(str(i+1))),
                test=i + 1,
                tests=len(x_tests),
                loss='%.5f' % loss,
                correct=correct
            )
        )
    print(f"Average test loss: {sum(test_loss) / len(test_loss)}")
    print(f"Accuracy: {accuracy / len(x_tests)}")
    return test_loss
# nn.py
import json
from datetime import datetime
from typing import Callable
import numpy as np
from np_encoder import NumpyArrayEncoder
class NeuralNetwork:
def __init__(self, layers: list[int], activation_function: Callable, dactivation_function: Callable=None, learning_rate: float=1e-3) -> None:
self.layers = layers
self.learning_rate = learning_rate
self.act = activation_function
self.dact = dactivation_function or self.d(activation_function)
self.delta = 1e-10
self.Z: list[np.ndarray] = [np.zeros(layers[0])]
self.W: list[np.ndarray] = [np.zeros(layers[0])]
self.B: list[np.ndarray] = [np.zeros(layers[0])]
self.output: list[np.ndarray] = [np.zeros(layers[0])]
for i in range(1, len(self.layers)):
self.W.append(np.random.randn(self.layers[i], self.layers[i-1]) * np.sqrt(2/layers[i-1]))
self.B.append(np.zeros(self.layers[i]))
self.Z.append(np.zeros(self.layers[i]))
self.output.append(np.zeros(self.layers[i]))
def d(self, f: Callable) -> Callable:
delta = 1e-10j
def df(x): return f(x + delta).imag / delta.imag
return df
def softmax(self, x):
exp_x = np.exp(x - np.max(x))
return exp_x / np.sum(exp_x)
def cross_entropy(self, y: np.ndarray) -> np.float64:
return -np.dot(y.T, np.log(self.output[-1] + self.delta))
def forward(self, x: np.ndarray) -> np.ndarray:
assert x.shape[0] == self.layers[0]
self.output[0] = x
for i in range(1, len(self.layers)):
self.Z[i] = np.dot(self.W[i], self.output[i-1]) + self.B[i]
if i == len(self.layers)-1: self.output[i] = self.softmax(self.Z[i])
else: self.output[i] = self.act(self.Z[i])
return self.output[-1]
def backward(self, y: np.ndarray) -> np.ndarray:
x = self.output[-1] - y
for i in reversed(range(1, len(self.layers))):
t = x * self.dact(self.Z[i])
x = np.dot(self.W[i].T, t)
self.W[i] -= self.learning_rate * np.outer(t, self.output[i-1])
self.B[i] -= self.learning_rate * t
return x
def fit(self, x: np.ndarray, y: np.ndarray) -> np.float64:
self.forward(x)
loss = self.cross_entropy(y)
self.backward(y)
return loss
def predict(self, x_tests: np.ndarray, y_tests: np.ndarray) -> list[np.float64]:
test_loss = []
accuracy = 0
for i, (x_test, y_test) in enumerate(zip(x_tests, y_tests)):
output = self.forward(x_test)
loss = self.cross_entropy(y_test)
correct = output.argmax() == y_test.argmax()
if correct:
accuracy += 1
test_loss.append(loss)
print(
"Test {space}{test}/{tests}, Loss: {loss}, Correct: {correct}"
.format(
space=" " * (len(str(len(x_tests))) - len(str(i+1))),
test=i + 1,
tests=len(x_tests),
loss='%.5f' % loss,
correct=correct
)
)
print(f"Average test loss: {sum(test_loss) / len(test_loss)}")
print(f"Accuracy: {accuracy / len(x_tests)}")
return test_loss
def train(self, x_trains: np.ndarray, y_trains: np.ndarray, epochs: int, batch_size: int=64, max_trains: int=60000, save: bool=False) -> list[np.float64]:
train_loss = []
for epoch in range(epochs):
max_trains = min(max_trains, len(x_trains))
total_loss = 0
start_time = datetime.now()
for i in range(0, max_trains, batch_size):
x_batch = x_trains[i:i + batch_size]
y_batch = y_trains[i:i + batch_size]
batch_loss = 0
for x_train, y_train in zip(x_batch, y_batch):
batch_loss += self.fit(x_train, y_train)
loss = batch_loss / batch_size
total_loss += loss
train_loss.append(loss)
batch = i // batch_size + 1
batchs = max_trains // batch_size + 1
print(
"Batch {space}{batch}/{batchs}, Loss: {loss}, Average Loss: {avg_loss}"
.format(
space=" " * (len(str(batchs)) - len(str(batch))),
batch=batch,
batchs=batchs,
loss='%.5f' % loss,
avg_loss='%.5f' % (total_loss / batch)
)
)
print(
"Epoch {space}{epoch}/{epochs}, Loss: {loss}, Save: {save}, Time: {time}"
.format(
space=" " * (len(str(epochs)) + len(str(batchs)) * 2 - len(str(epoch)) * 3),
epoch=epoch + 1,
epochs=epochs,
loss='%.5f' % (total_loss / (max_trains // batch_size + 1)),
save=save,
time=datetime.now() - start_time
)
)
if save:
self.save_params()
return train_loss
def save_params(self, filename: str="params.json"):
with open(filename, "w") as f:
json.dump({"W": self.W, "B": self.B}, f, indent=4, cls=NumpyArrayEncoder)
def load_params(self, filename: str="params.json"):
with open(filename, "r") as f:
params = json.load(f)
self.W = []
self.B = []
for w in params["W"]: self.W.append(np.asarray(w))
for b in params["B"]: self.B.append(np.asarray(b))
# *)#$#*)%*)#$#)*%
for i in range(1, len(self.layers)):
self.W.append(np.random.randn(self.layers[i], self.layers[i-1]) * np.sqrt(2/layers[i-1]))
self.B.append(np.zeros(self.layers[i]))
self.Z.append(np.zeros(self.layers[i]))
self.output.append(np.zeros(self.layers[i]))
np.random.randn(self.layers[i], self.layers[i-1]) * np.sqrt(2/layers[i-1])
for i in range(1, len(self.layers)):
self.W.append(np.random.randn(self.layers[i], self.layers[i-1]) * np.sqrt(2/layers[i-1]))
self.B.append(np.zeros(self.layers[i]))
self.Z.append(np.zeros(self.layers[i]))
self.output.append(np.zeros(self.layers[i]))
np.zeros(self.layers[i])
def adam(self, grad: np.ndarray, t: int) -> np.ndarray:
    """Return the Adam update for the parameter slot self.gd_tag.

    Updates the first/second moment estimates for the current slot,
    bias-corrects them with timestep t, and advances gd_tag to the
    next slot.
    """
    tag = self.gd_tag
    first = self.beta1 * self.m[tag] + (1 - self.beta1) * grad
    second = self.beta2 * self.v[tag] + (1 - self.beta2) * np.square(grad)
    self.m[tag] = first
    self.v[tag] = second
    correction1 = 1 - np.float_power(self.beta1, t)
    correction2 = 1 - np.float_power(self.beta2, t)
    self.gd_tag = tag + 1
    return self.learning_rate * (first / correction1) / (np.sqrt(second / correction2) + self.delta)
def adam(self, grad: np.ndarray, t: int) -> np.ndarray:
self.m[self.gd_tag] = self.beta1 * self.m[self.gd_tag] + (1 - self.beta1) * grad
self.v[self.gd_tag] = self.beta2 * self.v[self.gd_tag] + (1 - self.beta2) * np.square(grad)
m_hat = self.m[self.gd_tag] / (1 - np.float_power(self.beta1, t))
v_hat = self.v[self.gd_tag] / (1 - np.float_power(self.beta2, t))
self.gd_tag += 1
return self.learning_rate * m_hat / (np.sqrt(v_hat) + self.delta)
def adam(self, grad: np.ndarray, t: int) -> np.ndarray:
self.m[self.gd_tag] = self.beta1 * self.m[self.gd_tag] + (1 - self.beta1) * grad
self.v[self.gd_tag] = self.beta2 * self.v[self.gd_tag] + (1 - self.beta2) * np.square(grad)
m_hat = self.m[self.gd_tag] / (1 - np.float_power(self.beta1, t))
v_hat = self.v[self.gd_tag] / (1 - np.float_power(self.beta2, t))
self.gd_tag += 1
return self.learning_rate * m_hat / (np.sqrt(v_hat) + self.delta)
def adam(self, grad: np.ndarray, t: int) -> np.ndarray:
self.m[self.gd_tag] = self.beta1 * self.m[self.gd_tag] + (1 - self.beta1) * grad
self.v[self.gd_tag] = self.beta2 * self.v[self.gd_tag] + (1 - self.beta2) * np.square(grad)
m_hat = self.m[self.gd_tag] / (1 - np.float_power(self.beta1, t))
v_hat = self.v[self.gd_tag] / (1 - np.float_power(self.beta2, t))
self.gd_tag += 1
return self.learning_rate * m_hat / (np.sqrt(v_hat) + self.delta)
# nn.py
import json
from datetime import datetime
from typing import Callable
import numpy as np
from np_encoder import NumpyArrayEncoder
class NeuralNetwork:
def __init__(self, layers: list[int], activation_function: Callable, dactivation_function: Callable=None, learning_rate: float=1e-3) -> None:
self.layers = layers
self.learning_rate = learning_rate
self.act = activation_function
self.dact = dactivation_function or self.d(activation_function)
self.delta = 1e-10
self.Z: list[np.ndarray] = [np.zeros(layers[0])]
self.W: list[np.ndarray] = [np.zeros(layers[0])]
self.B: list[np.ndarray] = [np.zeros(layers[0])]
self.output: list[np.ndarray] = [np.zeros(layers[0])]
self.m: list[np.ndarray] = [0 for _ in range((len(layers)-1)*2)]
self.v: list[np.ndarray] = [0 for _ in range((len(layers)-1)*2)]
self.beta1 = 0.9
self.beta2 = 0.999
self.gd_times = 1
self.gd_tag = 0
for i in range(1, len(self.layers)):
self.W.append(np.random.randn(self.layers[i], self.layers[i-1]) * np.sqrt(2/layers[i-1]))
self.B.append(np.zeros(self.layers[i]))
self.Z.append(np.zeros(self.layers[i]))
self.output.append(np.zeros(self.layers[i]))
def d(self, f: Callable) -> Callable:
delta = 1e-10j
def df(x): return f(x + delta).imag / delta.imag
return df
def softmax(self, x):
exp_x = np.exp(x - np.max(x))
return exp_x / np.sum(exp_x)
def cross_entropy(self, y: np.ndarray) -> np.float64:
return -np.dot(y.T, np.log(self.output[-1] + self.delta))
def adam(self, grad: np.ndarray, t: int) -> np.ndarray:
self.m[self.gd_tag] = self.beta1 * self.m[self.gd_tag] + (1 - self.beta1) * grad
self.v[self.gd_tag] = self.beta2 * self.v[self.gd_tag] + (1 - self.beta2) * np.square(grad)
m_hat = self.m[self.gd_tag] / (1 - np.float_power(self.beta1, t))
v_hat = self.v[self.gd_tag] / (1 - np.float_power(self.beta2, t))
self.gd_tag += 1
return self.learning_rate * m_hat / (np.sqrt(v_hat) + self.delta)
def forward(self, x: np.ndarray) -> np.ndarray:
assert x.shape[0] == self.layers[0]
self.output[0] = x
for i in range(1, len(self.layers)):
self.Z[i] = np.dot(self.W[i], self.output[i-1]) + self.B[i]
if i == len(self.layers)-1: self.output[i] = self.softmax(self.Z[i])
else: self.output[i] = self.act(self.Z[i])
return self.output[-1]
def backward(self, y: np.ndarray) -> np.ndarray:
x = self.output[-1] - y
for i in reversed(range(1, len(self.layers))):
t = x * self.dact(self.Z[i])
x = np.dot(self.W[i].T, t)
self.W[i] -= self.adam(np.outer(t, self.output[i-1]), self.gd_times)
self.B[i] -= self.adam(t, self.gd_times)
self.gd_times += 1
self.gd_tag = 0
return x
def fit(self, x: np.ndarray, y: np.ndarray) -> np.float64:
self.forward(x)
loss = self.cross_entropy(y)
self.backward(y)
return loss
def predict(self, x_tests: np.ndarray, y_tests: np.ndarray) -> list[np.float64]:
test_loss = []
accuracy = 0
for i, (x_test, y_test) in enumerate(zip(x_tests, y_tests)):
output = self.forward(x_test)
loss = self.cross_entropy(y_test)
correct = output.argmax() == y_test.argmax()
if correct:
accuracy += 1
test_loss.append(loss)
print(
"Test {space}{test}/{tests}, Loss: {loss}, Correct: {correct}"
.format(
space=" " * (len(str(len(x_tests))) - len(str(i+1))),
test=i + 1,
tests=len(x_tests),
loss='%.5f' % loss,
correct=correct
)
)
print(f"Average test loss: {sum(test_loss) / len(test_loss)}")
print(f"Accuracy: {accuracy / len(x_tests)}")
return test_loss
def train(self, x_trains: np.ndarray, y_trains: np.ndarray, epochs: int, batch_size: int=64, max_trains: int=60000, save: bool=False) -> list[np.float64]:
train_loss = []
for epoch in range(epochs):
max_trains = min(max_trains, len(x_trains))
total_loss = 0
start_time = datetime.now()
for i in range(0, max_trains, batch_size):
x_batch = x_trains[i:i + batch_size]
y_batch = y_trains[i:i + batch_size]
batch_loss = 0
for x_train, y_train in zip(x_batch, y_batch):
batch_loss += self.fit(x_train, y_train)
loss = batch_loss / batch_size
total_loss += loss
train_loss.append(loss)
batch = i // batch_size + 1
batchs = max_trains // batch_size + 1
print(
"Batch {space}{batch}/{batchs}, Loss: {loss}, Average Loss: {avg_loss}"
.format(
space=" " * (len(str(batchs)) - len(str(batch))),
batch=batch,
batchs=batchs,
loss='%.5f' % loss,
avg_loss='%.5f' % (total_loss / batch)
)
)
print(
"Epoch {space}{epoch}/{epochs}, Loss: {loss}, Average Loss: {avg_loss}, Save: {save}, Time: {time}"
.format(
space=" " * (len(str(epochs)) + len(str(batchs)) * 2 - len(str(epoch)) * 3),
epoch=epoch + 1,
epochs=epochs,
loss='%.5f' % (total_loss / batch_size),
avg_loss='%.5f' % (total_loss / (max_trains // batch_size + 1)),
save=save,
time=datetime.now() - start_time
)
)
if save:
self.save_params()
return train_loss
def save_params(self, filename: str="params.json"):
    """Dump the current weights (W) and biases (B) to a JSON file."""
    payload = {"W": self.W, "B": self.B}
    with open(filename, "w") as f:
        json.dump(payload, f, indent=4, cls=NumpyArrayEncoder)
def load_params(self, filename: str="params.json"):
    """Restore weights and biases previously written by save_params."""
    with open(filename, "r") as f:
        params = json.load(f)
    # Rebuild the parameter lists as numpy arrays in one pass each.
    self.W = [np.asarray(w) for w in params["W"]]
    self.B = [np.asarray(b) for b in params["B"]]
# *)#$#*)%*)#$#)*%
還記得我們講過可以將資料進行標準化
來讓梯度下降時效率更好
同時也可以一定程度上限制數值範圍
還記得我們講過可以將資料進行標準化
來讓梯度下降時效率更好
同時也可以一定程度上限制數值範圍
因此我們會對每層輸出後的資料
(wx+b / act(wx+b))
在每個輸入資料上做標準化
而我們常會使用一個批次作為單位
對他做標準化
一般標準化的式子如下
一般標準化的式子如下
而我們通常會加上兩個參數做線性調整
這兩個參數是會被更新的
然後難受的就是反向傳播了
我相信我在這邊再解一次偏微分要變催眠大會了
所以廢話不多說直接上結果
有了要傳播到前面的dx以及
要更新的dgamma&dbeta後
就可以完成反向傳播了
有了要傳播到前面的dx以及
要更新的dgamma&dbeta後
就可以完成反向傳播了
然後就是我覺得反正應該沒有時間所以就不做實作了的環節了
另外
在測試時由於資料沒辦法一筆一筆batch的給
所以呢平均數跟變異數的計算時
會以當前所有吃過的資料的數值和做計算
更多資訊可以參考台大教授的影片
裡面最後還有提供其他的標準化方式
有興趣的可以慢慢研究
你的程式在資料集中跑出了99%的優異成績
正當你興高采烈的丟測試資料進去時
你可能會發現
怎麼跑這麼爛
這是因為你的程式已經
完全變成訓練資料的形狀了
簡單來講也就是擬合程度太高
使得他變成專解
這個訓練資料
而不是相同類型資料的函式了
這是因為你的程式已經
完全變成訓練資料的形狀了
簡單來講也就是擬合程度太高
使得他變成專解
這個訓練資料
而不是相同類型資料的函式了
這種情況我們稱之為過擬合
「共適應(Co-adaptation)是生物學中的一個概念
指的是兩個或多個相互作用的
特徵、基因或物種在自然選擇壓力下共同演化
彼此之間的適應性變化使得它們在一起時更具優勢
這些特徵只有在相互作用時才具有益處
這種過程有時會導致它們之間的相互依賴性增加」
by wiki+chatgpt
這樣一個特性也適用於神經網路中
並且可以透過抑制共適應來達到減緩過擬合
而現今最常使用的方式稱為Dropout
簡單來說就是透過在訓練時捨棄某些神經元
來抑制共適應
實作就給他個機率
對每個神經元在傳播時根據機率傳播值會變0
然後為了讓訓練跟測試時的期望值不受到dropout影響
通常會將測試時神經元輸出乘上(1 - dropout機率)
相當於訓練階段時的期望值
同理你也可以在訓練階段除以(1 - dropout機率)
import random
import json
from datetime import datetime
from typing import Callable
import numpy as np
from np_encoder import NumpyArrayEncoder
class NeuralNetwork:
    """Small fully-connected classifier trained one sample at a time.

    Dense layers (W @ x + b) with a user-supplied activation on hidden
    layers, softmax on the output layer, cross-entropy loss, and inverted
    dropout on hidden pre-activations during training.

    Fixes relative to the original:
    - the dropout mask was inverted: units were KEPT with probability
      ``dropout`` yet scaled by 1/(1-dropout), so the hyperparameter meant
      the opposite of its name except at exactly 0.5,
    - dropout was also applied to the softmax output layer,
    - the mask was not stored, so backward() ignored the dropout scaling,
    - the output-layer delta was multiplied by the activation derivative;
      for softmax + cross-entropy the delta is exactly ``output - y``,
    - predict() divided by zero on an empty test set,
    - train() used wrong denominators for the batch count and epoch loss.
    """

    def __init__(self, layers: list[int], activation_function: Callable, dactivation_function: Callable=None, learning_rate: float=1e-3, dropout: float=0.5) -> None:
        """Build the network.

        layers: neuron count per layer, input layer first.
        activation_function: hidden-layer activation (e.g. ReLU).
        dactivation_function: its derivative; numerically approximated
            via a complex-step derivative when omitted.
        learning_rate: SGD step size.
        dropout: probability of DROPPING a hidden unit during training.
        """
        self.layers = layers
        self.learning_rate = learning_rate
        self.act = activation_function
        self.dact = dactivation_function or self.d(activation_function)
        self.dropout = dropout
        self.is_training = True
        self.delta = 1e-10  # guards log(0) in the cross-entropy
        self.Z: list[np.ndarray] = [np.zeros(layers[0])]
        self.W: list[np.ndarray] = [np.zeros(layers[0])]
        self.B: list[np.ndarray] = [np.zeros(layers[0])]
        self.output: list[np.ndarray] = [np.zeros(layers[0])]
        # Per-layer dropout masks (None where no dropout was applied);
        # consumed by backward() so gradients match the forward pass.
        self.masks: list = [None] * len(layers)
        for i in range(1, len(self.layers)):
            # He initialization, suited to ReLU-like activations.
            self.W.append(np.random.randn(self.layers[i], self.layers[i-1]) * np.sqrt(2/layers[i-1]))
            self.B.append(np.zeros(self.layers[i]))
            self.Z.append(np.zeros(self.layers[i]))
            self.output.append(np.zeros(self.layers[i]))

    def d(self, f: Callable) -> Callable:
        """Complex-step numerical derivative of f (accurate for analytic f)."""
        delta = 1e-10j
        def df(x): return f(x + delta).imag / delta.imag
        return df

    def softmax(self, x):
        """Numerically stable softmax (shift by the max before exp)."""
        exp_x = np.exp(x - np.max(x))
        return exp_x / np.sum(exp_x)

    def cross_entropy(self, y: np.ndarray) -> np.float64:
        """Cross-entropy of the last forward pass against one-hot target y."""
        return -np.dot(y.T, np.log(self.output[-1] + self.delta))

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Forward pass; returns the softmax output vector.

        During training, inverted dropout is applied to hidden layers only:
        each unit is kept with probability (1 - dropout) and the survivors
        are scaled by 1/(1 - dropout) so the expected value is unchanged,
        making test-time rescaling unnecessary.
        """
        assert x.shape[0] == self.layers[0]
        self.output[0] = x
        last = len(self.layers) - 1
        for i in range(1, len(self.layers)):
            self.Z[i] = np.dot(self.W[i], self.output[i-1]) + self.B[i]
            self.masks[i] = None
            if self.is_training and i != last and 0 < self.dropout < 1:
                mask = (np.random.rand(*self.Z[i].shape) >= self.dropout) / (1 - self.dropout)
                self.Z[i] = self.Z[i] * mask
                self.masks[i] = mask
            if i == last:
                self.output[i] = self.softmax(self.Z[i])
            else:
                self.output[i] = self.act(self.Z[i])
        return self.output[-1]

    def backward(self, y: np.ndarray) -> np.ndarray:
        """One SGD step from the stored forward pass; returns the input grad."""
        x = self.output[-1] - y
        last = len(self.layers) - 1
        for i in reversed(range(1, len(self.layers))):
            if i == last:
                # softmax + cross-entropy: delta is exactly output - y.
                t = x
            else:
                t = x * self.dact(self.Z[i])
                if self.masks[i] is not None:
                    # Mirror the forward dropout scaling in the gradient.
                    t = t * self.masks[i]
            x = np.dot(self.W[i].T, t)
            self.W[i] -= self.learning_rate * np.outer(t, self.output[i-1])
            self.B[i] -= self.learning_rate * t
        return x

    def fit(self, x: np.ndarray, y: np.ndarray) -> np.float64:
        """Forward, compute the loss, backward; returns the sample loss."""
        self.forward(x)
        loss = self.cross_entropy(y)
        self.backward(y)
        return loss

    def predict(self, x_tests: np.ndarray, y_tests: np.ndarray) -> list[np.float64]:
        """Evaluate on a labelled test set with dropout disabled.

        Prints per-sample loss/correctness plus overall average loss and
        accuracy, and returns the per-sample losses.
        """
        self.is_training = False
        test_loss = []
        accuracy = 0
        total = len(x_tests)
        for i, (x_test, y_test) in enumerate(zip(x_tests, y_tests)):
            output = self.forward(x_test)
            loss = self.cross_entropy(y_test)
            correct = output.argmax() == y_test.argmax()
            if correct:
                accuracy += 1
            test_loss.append(loss)
            print(
                "Test {space}{test}/{tests}, Loss: {loss}, Correct: {correct}"
                .format(
                    space=" " * (len(str(total)) - len(str(i+1))),
                    test=i + 1,
                    tests=total,
                    loss='%.5f' % loss,
                    correct=correct
                )
            )
        if test_loss:  # guard: nothing to average on an empty test set
            print(f"Average test loss: {sum(test_loss) / len(test_loss)}")
            print(f"Accuracy: {accuracy / total}")
        return test_loss

    def train(self, x_trains: np.ndarray, y_trains: np.ndarray, epochs: int, batch_size: int=64, max_trains: int=60000, save: bool=False) -> list[np.float64]:
        """Mini-batch SGD training loop with dropout enabled.

        Returns the list of per-batch average losses; writes parameters to
        disk after every epoch when ``save`` is true.
        """
        self.is_training = True
        train_loss = []
        for epoch in range(epochs):
            max_trains = min(max_trains, len(x_trains))
            # Ceiling division: exact number of batches.
            batchs = -(-max_trains // batch_size)
            total_loss = 0
            start_time = datetime.now()
            for i in range(0, max_trains, batch_size):
                x_batch = x_trains[i:i + batch_size]
                y_batch = y_trains[i:i + batch_size]
                batch_loss = 0
                for x_train, y_train in zip(x_batch, y_batch):
                    batch_loss += self.fit(x_train, y_train)
                # Average over the real batch length, not the nominal size.
                loss = batch_loss / len(x_batch)
                total_loss += loss
                train_loss.append(loss)
                batch = i // batch_size + 1
                print(
                    "Batch {space}{batch}/{batchs}, Loss: {loss}, Average Loss: {avg_loss}"
                    .format(
                        space=" " * (len(str(batchs)) - len(str(batch))),
                        batch=batch,
                        batchs=batchs,
                        loss='%.5f' % loss,
                        avg_loss='%.5f' % (total_loss / batch)
                    )
                )
            print(
                "Epoch {space}{epoch}/{epochs}, Loss: {loss}, Save: {save}, Time: {time}"
                .format(
                    space=" " * (len(str(epochs)) - len(str(epoch + 1))),
                    epoch=epoch + 1,
                    epochs=epochs,
                    loss='%.5f' % (total_loss / max(batchs, 1)),
                    save=save,
                    time=datetime.now() - start_time
                )
            )
            if save:
                self.save_params()
        return train_loss

    def save_params(self, filename: str="params.json"):
        """Dump the current weights (W) and biases (B) to a JSON file."""
        with open(filename, "w") as f:
            json.dump({"W": self.W, "B": self.B}, f, indent=4, cls=NumpyArrayEncoder)

    def load_params(self, filename: str="params.json"):
        """Restore weights and biases previously written by save_params."""
        with open(filename, "r") as f:
            params = json.load(f)
        self.W = [np.asarray(w) for w in params["W"]]
        self.B = [np.asarray(b) for b in params["B"]]
# *)#$#*)%*)#$#)*%
有人可能會好奇為什麼不連矩陣一起手刻
因為numpy的程式是基於C語言
因此可以有比一般Python程式更快速的運算速度
有人可能會好奇為什麼不連矩陣一起手刻
因為numpy的程式是基於C語言
因此可以有比一般Python程式更快速的運算速度
但它還是有個缺點
那就是它是使用CPU在跑
讓numpy可以使用GPU跑的函式庫
使用方式就是把原本的
import numpy
換成
import cupy
缺點是安裝麻煩一點
視情況要根據你的cuda版本安裝不同的函式庫
那當然還有個更快的辦法
也就是直接使用別人造好的輪子
整個神經網路都經過最完美的優化
從底層幫你做好
像是pytorch之類的
那當然還有個更快的辦法
也就是直接使用別人造好的輪子
整個神經網路都經過最完美的優化
從底層幫你做好
像是pytorch之類的
但這樣就不好玩了嘛
# by chatgpt
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
# Training hyperparameters for the MNIST demo.
batch_size = 64
learning_rate = 1e-3
epochs = 10
# Use the GPU when one is available; fall back to CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Download (if needed) and load the MNIST training split; ToTensor yields
# float tensors scaled into [0, 1].
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
class NN(nn.Module):
    """Fully-connected MNIST classifier: 784 -> 256 -> 128 -> 64 -> 10."""

    def __init__(self):
        super().__init__()
        widths = [28 * 28, 256, 128, 64]
        stages = [nn.Flatten()]
        # Hidden stack: Linear followed by ReLU for every consecutive pair.
        for d_in, d_out in zip(widths, widths[1:]):
            stages.append(nn.Linear(d_in, d_out))
            stages.append(nn.ReLU())
        # Output head: raw logits for the 10 digit classes.
        stages.append(nn.Linear(widths[-1], 10))
        self.model = nn.Sequential(*stages)

    def forward(self, x):
        return self.model(x)
# Reuse the module-level `device` instead of re-deriving cuda availability
# (the original duplicated the torch.device expression here).
model = NN().to(device)
# Cross-entropy over raw logits (softmax is folded into the loss).
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(1, epochs + 1):
    model.train()  # enable training-mode behavior (dropout/batchnorm, if any)
    running_loss = 0.0
    correct = 0
    total = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Weight the batch loss by its size so the epoch average stays exact
        # even when the final batch is smaller.
        running_loss += loss.item() * images.size(0)
        # argmax over the class dimension; avoids the deprecated `.data`
        # access the original used with torch.max(outputs.data, 1).
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = 100 * correct / total
    print(f"Epoch {epoch}/{epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%")
所以有興趣都建議去玩玩看這些函式庫
手刻最主要是帶大家理解它的結構以及背後的數學
這邊手刻的各種東西你都能在裡面找到現成品
並且你知道這東西該怎麼調參、使用了
有時候你叛逆的神經網路會送你一個這樣的圖
讓你Train出來突然變一坨
有時候你叛逆的神經網路會送你一個這樣的圖
讓你Train出來突然變一坨
因此我們可以設定每N個Batch / Epoch做完時
紀錄Loss與當時的參數資料
這麼一來可以在發現Loss開始非期望中的變大時
直接將他停止繼續訓練
所有需要由使用者自行調整的皆為超參數
像是每層的神經元個數、批次大小、學習率等
這些參數在設定時可以參考現有模型的設定
或是網路上的文章教學
拿前人試出的結果總比自己花時間試好 (?
你會發現它在辨識不在資料集內的圖片時
表現的成果異常拙劣
一部份是由於沒給到其他風格的資料
但另一部份是我們的辨識方式
沒有經由特徵的判斷
而是直接根據像素點的位置去做計算
這要如何改善呢?
這要如何改善呢?
By lucasw