65 lines
No EOL
2.6 KiB
Text
65 lines
No EOL
2.6 KiB
Text
// Fraction helper: builds `frac(a, b)` and wraps it in `display(...)`,
// so the fraction is typeset at display size wherever it is used.
#let dfrac(a, b) = $display(frac(#a, #b))$
|
|
|
|
= Problem 1a
|
|
|
|
Given:
|
|
|
|
// Bold shorthand symbols used in the formulas of this document:
// `ww` -> bold w, `xx` -> bold x, `vv` -> bold v, `XX` -> bold X.
#let ww = $bold(w)$
|
|
#let xx = $bold(x)$
|
|
#let vv = $bold(v)$
|
|
#let XX = $bold(X)$
|
|
|
|
- $E(ww_1,ww_2,vv|XX) = - sum_t (r^t log y^t + (1 - r^t) log(1 - y^t))$
|
|
- $y^t = "sigmoid"(v_2 z^t_2 + v_1 z^t_1 + v_0)$
|
|
- $z^t_1 = "ReLU"(w_(1,2)x^t_2 + w_(1,1)x^t_1 + w_(1,0))$
|
|
- $z^t_2 = tanh(w_(2,2)x^t_2 + w_(2,1)x^t_1 + w_(2,0))$
|
|
|
|
The indices follow the convention $x_(j=1..D)$, $y_(i=1..K)$, and $z_(h=1..H)$; the bias terms correspond to $x^t_0 = 1$ and $z^t_0 = 1$.
|
|
|
|
Solved as:
|
|
|
|
- $
  frac(diff E, diff v_h) &= sum_t frac(diff E, diff y^t) frac(diff y^t, diff v_h) \
  &= - sum_t (r^t dot frac(1, y^t) - (1-r^t) dot frac(1, 1-y^t)) (y^t z^t_h (1-y^t)) \
  &= - sum_t (frac(r^t, y^t) - frac(1-r^t, 1-y^t)) (y^t z^t_h (1-y^t)) \
  &= - sum_t (frac(r^t (1-y^t)-y^t (1-r^t), cancel(y^t) (1-y^t))) (cancel(y^t) z^t_h (1-y^t)) \
  &= - sum_t (frac(r^t - y^t, cancel(1-y^t))) (z^t_h cancel((1-y^t))) \
  &= - sum_t (r^t - y^t) z^t_h \
$
|
|
|
|
- $
  frac(diff E, diff w_(1,j)) &= sum_t frac(diff E, diff y^t) frac(diff y^t, diff z^t_1) frac(diff z^t_1, diff w_(1,j)) \
  &= - sum_t (frac(r^t, y^t) - frac(1-r^t, 1-y^t)) (y^t (1-y^t) v_1) (x^t_j cases(0 "if" ww_1 dot xx^t < 0, 1 "otherwise")) \
  &= - sum_t (r^t - y^t) v_1 x^t_j cases(0 "if" ww_1 dot xx^t < 0, 1 "otherwise") \
$
|
|
|
|
- $
  frac(diff E, diff w_(2,j)) &= sum_t frac(diff E, diff y^t) frac(diff y^t, diff z^t_2) frac(diff z^t_2, diff w_(2,j)) \
  &= - sum_t (r^t - y^t) v_2 x^t_j (1-tanh^2(ww_2 dot xx^t)) \
$
|
|
|
|
Updates:
|
|
|
|
- $Delta v_h = eta sum_t (r^t-y^t) z^t_h$
|
|
- $Delta w_(1,j) = eta sum_t (r^t - y^t) v_1 x^t_j cases(0 "if" ww_1 dot xx^t < 0, 1 "otherwise")$
|
|
- $Delta w_(2,j) = eta sum_t (r^t - y^t) v_2 x^t_j (1-tanh^2(ww_2 dot xx^t))$
|
|
|
|
= Problem 1b
|
|
|
|
- $E(ww,vv|XX) = - sum_t (r^t log y^t + (1 - r^t) log (1 - y^t))$
|
|
- $y^t = "sigmoid"(v_2 z^t_2 + v_1 z^t_1 + v_0)$
|
|
- $z^t_1 = "ReLU"(w_2 x^t_2 + w_1 x^t_1 + w_0)$
|
|
- $z^t_2 = tanh(w_2 x^t_2 + w_1 x^t_1 + w_0)$
|
|
|
|
Updates:
|
|
|
|
- Same as above: $Delta v_h = eta sum_t (r^t-y^t) z^t_h$
|
|
|
|
- $
  frac(diff E, diff w_j) &= sum_t (frac(diff E, diff y^t) frac(diff y^t, diff z^t_1) frac(diff z^t_1, diff w_j) + frac(diff E, diff y^t) frac(diff y^t, diff z^t_2) frac(diff z^t_2, diff w_j)) \
  &= sum_t frac(diff E, diff y^t) (frac(diff y^t, diff z^t_1) frac(diff z^t_1, diff w_j) + frac(diff y^t, diff z^t_2) frac(diff z^t_2, diff w_j)) \
  &= - sum_t (frac(r^t, y^t) - frac(1-r^t, 1-y^t)) (frac(diff y^t, diff z^t_1) frac(diff z^t_1, diff w_j) + frac(diff y^t, diff z^t_2) frac(diff z^t_2, diff w_j)) \
  &= - sum_t (frac(r^t-y^t, y^t (1-y^t))) (frac(diff y^t, diff z^t_1) frac(diff z^t_1, diff w_j) + frac(diff y^t, diff z^t_2) frac(diff z^t_2, diff w_j)) \
$