Department of Computer Science and Engineering, IIT Madras
[Figure: a feedforward neural network with inputs $x_1, x_2, \ldots, x_n$, pre-activations $a_1, a_2, a_3$, hidden-layer activations $h_1, h_2$, output $h_L = \hat{y} = f(x)$, and parameters $W_1, b_1, W_2, b_2, W_3, b_3$]
The input layer can be called the 0-th layer and the output layer can be called the L-th layer.
$W_i \in \mathbb{R}^{n \times n}$ and $b_i \in \mathbb{R}^{n}$ are the weight and bias between layers $i-1$ and $i$ ($0 < i < L$).
$W_L \in \mathbb{R}^{k \times n}$ and $b_L \in \mathbb{R}^{k}$ are the weight and bias between the last hidden layer and the output layer ($L = 3$ in this case).
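To make the shapes concrete, here is a small sketch of parameter initialization under these dimensions (the values of $n$, $k$, $L$ and the use of small random normals are my own illustrative choices, not prescribed by the slides):

```python
import numpy as np

n, k, L = 4, 3, 3                     # hidden width, output width, number of layers
rng = np.random.default_rng(0)

# W_i in R^{n x n}, b_i in R^n for the hidden layers (0 < i < L)
weights = [rng.standard_normal((n, n)) * 0.1 for _ in range(L - 1)]
biases = [np.zeros(n) for _ in range(L - 1)]

# W_L in R^{k x n}, b_L in R^k for the output layer
weights.append(rng.standard_normal((k, n)) * 0.1)
biases.append(np.zeros(k))
```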
$a_i(x) = b_i + W_i h_{i-1}(x)$
$h_i(x) = g(a_i(x))$
$f(x) = h_L(x) = O(a_L(x))$
where $h_0(x) = x$, $g$ is the activation function of the hidden layers, and $O$ is the output activation function.
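As a minimal sketch of these three equations in NumPy (my own illustration — sigmoid as $g$ and softmax as $O$ are assumed choices, and `weights`/`biases` are lists $[W_1, \ldots, W_L]$, $[b_1, \ldots, b_L]$ such as the ones built above):

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def softmax(z):
    e = np.exp(z - z.max())          # shift for numerical stability
    return e / e.sum()

def forward(x, weights, biases):
    """h_0 = x; a_i = b_i + W_i h_{i-1}; h_i = g(a_i); f(x) = O(a_L)."""
    h = x
    for W, b in zip(weights[:-1], biases[:-1]):
        a = b + W @ h                # pre-activation of hidden layer i
        h = sigmoid(a)               # activation of hidden layer i
    a_L = biases[-1] + weights[-1] @ h
    return softmax(a_L)              # the predicted output y_hat
```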
More compactly, dropping the explicit dependence on $x$:
$a_i = b_i + W_i h_{i-1}$
$h_i = g(a_i)$
$f(x) = h_L = O(a_L)$
$\hat{y}_i = f(x_i) = O(W_3\, g(W_2\, g(W_1 x_i + b_1) + b_2) + b_3)$
The parameters of the model are $\theta = W_1, \ldots, W_L, b_1, b_2, \ldots, b_L$ ($L = 3$ here), and they are learned by solving
$\min_{\theta}\ \frac{1}{N}\sum_{i=1}^{N}\sum_{j=1}^{k}(\hat{y}_{ij} - y_{ij})^2$
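A small sketch of this objective as code (my own illustration; `y_hat` and `y` are assumed to be $N \times k$ arrays of predictions and targets):

```python
import numpy as np

def squared_error(y_hat, y):
    """(1/N) * sum_i sum_j (y_hat_ij - y_ij)^2."""
    return np.sum((y_hat - y) ** 2) / y.shape[0]
```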
Gradient descent:
t ← 0;
max_iterations ← 1000;
Initialize $w_0, b_0$;
while t++ < max_iterations do
    $w_{t+1} \leftarrow w_t - \eta\nabla w_t$;
    $b_{t+1} \leftarrow b_t - \eta\nabla b_t$;
end
The same update written in terms of a single parameter vector $\theta = [w, b]$:
t ← 0;
max_iterations ← 1000;
Initialize $\theta_0 = [w_0, b_0]$;
while t++ < max_iterations do
    $\theta_{t+1} \leftarrow \theta_t - \eta\nabla\theta_t$;
end
And for the full network, with $\theta$ collecting all the weights and biases:
t ← 0;
max_iterations ← 1000;
Initialize $\theta_0 = [W_1^0, \ldots, W_L^0, b_1^0, \ldots, b_L^0]$;
while t++ < max_iterations do
    $\theta_{t+1} \leftarrow \theta_t - \eta\nabla\theta_t$;
end
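A hedged sketch of this loop in NumPy (the gradient function, learning rate $\eta$ and iteration budget are placeholders of my own, not values prescribed by the slides):

```python
import numpy as np

def gradient_descent(theta0, grad_fn, eta=0.1, max_iterations=1000):
    """theta_{t+1} <- theta_t - eta * grad(theta_t), repeated max_iterations times."""
    theta = np.asarray(theta0, dtype=float)
    for _ in range(max_iterations):
        theta = theta - eta * grad_fn(theta)
    return theta

# Toy usage: minimize ||theta - [1, 2]||^2, whose gradient is 2*(theta - [1, 2]).
theta_star = gradient_descent(np.zeros(2), lambda th: 2 * (th - np.array([1.0, 2.0])))
```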
$\mathscr{L}(\theta) = \frac{1}{N}\sum_{i=1}^{N}\sum_{j=1}^{3}(\hat{y}_{ij} - y_{ij})^2$
[Figure: a neural network with $L-1$ hidden layers applied to the movie example — the input $x_i$ contains features such as isActor Damon and isDirector Nolan, and the target is $y_i = \{$imdb Rating: 7.5, Critics Rating: 8.2, RT Rating: 7.7$\}$]
[Figure: a neural network with $L-1$ hidden layers applied to a classification example — the classes are Apple, Mango, Orange and Banana, and the true label is $y = [1\ 0\ 0\ 0]$ (Apple)]
$\mathscr{L}(\theta) = -\sum_{c=1}^{k} y_c \log \hat{y}_c$
Since $y$ is a one-hot vector with $y_\ell = 1$ for the true class $\ell$, this reduces to $-\log \hat{y}_\ell$, where
$\hat{y}_\ell = [O(W_3\, g(W_2\, g(W_1 x + b_1) + b_2) + b_3)]_\ell$
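A minimal sketch of this loss (my own illustration; a softmax output layer and a one-hot label, e.g. Apple = [1, 0, 0, 0], are assumed):

```python
import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def cross_entropy(y_hat, y):
    """L(theta) = -sum_c y_c log(y_hat_c); for one-hot y this is -log(y_hat_l)."""
    return -np.sum(y * np.log(y_hat))

a_L = np.array([2.0, 0.5, 0.1, -1.0])   # illustrative output-layer pre-activations
y = np.array([1.0, 0.0, 0.0, 0.0])      # true class: Apple
loss = cross_entropy(softmax(a_L), y)   # equals -log of the probability of Apple
```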
| Outputs | Real Values | Probabilities |
|---|---|---|
| Output Activation | Linear | Softmax |
| Loss Function | Squared Error | Cross Entropy |
Recall the gradient descent update $\theta_{t+1} \leftarrow \theta_t - \eta\nabla\theta_t$. The one piece still missing is how to compute $\nabla\theta_t$, the gradient of $\mathscr{L}(\theta)$ with respect to every weight and bias; the chain rule gives the answer.
[Figure: the chain of dependencies from the input to the loss, $x_1 \xrightarrow{W^1_{11}} a_{11} \rightarrow h_{11} \xrightarrow{W^2_{11}} a_{21} \rightarrow h_{21} \rightarrow \cdots \xrightarrow{W^L_{11}} a_{L1} \rightarrow \hat{y} = f(x) \rightarrow \mathscr{L}(\theta)$]
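Reading that chain off the figure, the gradient with respect to the first-layer weight is obtained by multiplying the local derivatives along the path (a worked statement of the chain rule for this path, written out here for concreteness):

$$\frac{\partial \mathscr{L}(\theta)}{\partial W^{1}_{11}} = \frac{\partial \mathscr{L}(\theta)}{\partial \hat{y}} \cdot \frac{\partial \hat{y}}{\partial a_{L1}} \cdot \frac{\partial a_{L1}}{\partial h_{21}} \cdot \frac{\partial h_{21}}{\partial a_{21}} \cdot \frac{\partial a_{21}}{\partial h_{11}} \cdot \frac{\partial h_{11}}{\partial a_{11}} \cdot \frac{\partial a_{11}}{\partial W^{1}_{11}}$$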
[Figure: the network with the cross-entropy loss $-\log \hat{y}_\ell$ at the output; the goal is the gradient of this loss with respect to $W_3, b_3$, then $W_2, b_2$, then $W_1, b_1$]
The intuition, working backwards from the loss:
- For the output layer's parameters ($W_3, b_3$): talk to the weight directly.
- For the layer below ($W_2, b_2$): talk to the output layer, talk to the previous hidden layer, and now talk to the weights.
- For the layer below that ($W_1, b_1$): talk to the output layer, talk to the previous hidden layer, talk to the previous hidden layer again, and now talk to the weights.
Starting at the output, the derivative of the loss with respect to the $i$-th output is
$\frac{\partial(-\log \hat{y}_\ell)}{\partial \hat{y}_i} = \begin{cases} -\frac{1}{\hat{y}_\ell} & \text{if } i = \ell \\ 0 & \text{otherwise} \end{cases}$
Putting it all together, the training procedure is:

gradient_descent():
t ← 0;
max_iterations ← 1000;
Initialize $\theta_0 = [W_1^0, \ldots, W_L^0, b_1^0, \ldots, b_L^0]$;
while t++ < max_iterations do
    $h_1, h_2, \ldots, h_{L-1}, a_1, a_2, \ldots, a_L, \hat{y}$ = forward_propagation($\theta_t$);
    $\nabla\theta_t$ = backward_propagation($h_1, \ldots, h_{L-1}, a_1, \ldots, a_L, y, \hat{y}$);
    $\theta_{t+1} \leftarrow \theta_t - \eta\nabla\theta_t$;
end

forward_propagation($\theta$):
for k = 1 to L−1 do
    $a_k = b_k + W_k h_{k-1}$;
    $h_k = g(a_k)$;
end
$a_L = b_L + W_L h_{L-1}$;
$\hat{y} = O(a_L)$;

backward_propagation($h_1, \ldots, h_{L-1}, a_1, \ldots, a_L, y, \hat{y}$):
$\nabla_{a_L}\mathscr{L}(\theta) = -(e(y) - \hat{y})$;   (where $e(y)$ is the one-hot encoding of the true class)
for k = L to 1 do
    $\nabla_{W_k}\mathscr{L}(\theta) = \nabla_{a_k}\mathscr{L}(\theta)\, h_{k-1}^{T}$;
    $\nabla_{b_k}\mathscr{L}(\theta) = \nabla_{a_k}\mathscr{L}(\theta)$;
    $\nabla_{h_{k-1}}\mathscr{L}(\theta) = W_k^{T}\, \nabla_{a_k}\mathscr{L}(\theta)$;
    $\nabla_{a_{k-1}}\mathscr{L}(\theta) = \nabla_{h_{k-1}}\mathscr{L}(\theta) \odot [\ldots, g'(a_{k-1,j}), \ldots]$;
end
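A compact NumPy sketch of forward_propagation and backward_propagation under exactly these update rules (my own illustration — sigmoid hidden units, a softmax output and a one-hot label array standing in for $e(y)$ are assumed):

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def forward_propagation(x, W, b):
    """Return the lists of pre-activations a_k and activations h_k (with h[0] = x)."""
    a, h = [], [x]
    for k in range(len(W) - 1):
        a.append(b[k] + W[k] @ h[-1])        # a_k = b_k + W_k h_{k-1}
        h.append(sigmoid(a[-1]))             # h_k = g(a_k)
    a.append(b[-1] + W[-1] @ h[-1])          # a_L
    y_hat = softmax(a[-1])                   # y_hat = O(a_L)
    return a, h, y_hat

def backward_propagation(a, h, y, y_hat, W):
    """Gradients of the cross-entropy loss w.r.t. every W_k and b_k."""
    grad_a = -(y - y_hat)                    # grad_{a_L} L = -(e(y) - y_hat)
    dW, db = [], []
    for k in reversed(range(len(W))):
        dW.insert(0, np.outer(grad_a, h[k]))          # grad_{W_k} L = grad_{a_k} L h_{k-1}^T
        db.insert(0, grad_a)                          # grad_{b_k} L = grad_{a_k} L
        if k > 0:
            grad_h = W[k].T @ grad_a                  # grad_{h_{k-1}} L = W_k^T grad_{a_k} L
            s = sigmoid(a[k - 1])
            grad_a = grad_h * s * (1 - s)             # elementwise product with g'(a_{k-1})
    return dW, db
```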
Logistic (sigmoid) function:
$g(z) = \sigma(z) = \frac{1}{1 + e^{-z}}$
$g'(z) = (-1)\,\frac{1}{(1+e^{-z})^2}\,\frac{d}{dz}(1+e^{-z})$
$= (-1)\,\frac{1}{(1+e^{-z})^2}\,(-e^{-z})$
$= \frac{1}{1+e^{-z}}\cdot\frac{1+e^{-z}-1}{1+e^{-z}}$
$= g(z)\,(1 - g(z))$
tanh function:
$g(z) = \tanh(z) = \frac{e^{z} - e^{-z}}{e^{z} + e^{-z}}$
$g'(z) = \frac{(e^{z}+e^{-z})\,\frac{d}{dz}(e^{z}-e^{-z}) - (e^{z}-e^{-z})\,\frac{d}{dz}(e^{z}+e^{-z})}{(e^{z}+e^{-z})^2}$
$= \frac{(e^{z}+e^{-z})^2 - (e^{z}-e^{-z})^2}{(e^{z}+e^{-z})^2}$
$= 1 - \frac{(e^{z}-e^{-z})^2}{(e^{z}+e^{-z})^2}$
$= 1 - (g(z))^2$
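As a quick numerical sanity check of both derivative identities (my own sketch, comparing against central finite differences):

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

z = np.linspace(-3, 3, 7)
eps = 1e-6

# g'(z) = g(z)(1 - g(z)) for the logistic function
analytic_sig = sigmoid(z) * (1 - sigmoid(z))
numeric_sig = (sigmoid(z + eps) - sigmoid(z - eps)) / (2 * eps)
assert np.allclose(analytic_sig, numeric_sig, atol=1e-6)

# g'(z) = 1 - g(z)^2 for tanh
analytic_tanh = 1 - np.tanh(z) ** 2
numeric_tanh = (np.tanh(z + eps) - np.tanh(z - eps)) / (2 * eps)
assert np.allclose(analytic_tanh, numeric_tanh, atol=1e-6)
```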