Is there a way to reduce the amount of code for RMSProp?
I have code for a simple recurrent neural network and would like to know if there is a way for me to reduce the amount of code necessary for my update stage. This is the code I have so far:
    import numpy as np

    class RNN(object):
        def __init__(self, data, hidden_size, eps=0.0001):
            self.data = data
            self.hidden_size = hidden_size
            self.weights_hidden = np.random.rand(hidden_size, hidden_size) * 0.1   # W
            self.weights_input = np.random.rand(hidden_size, len(data[0])) * 0.1   # U
            self.weights_output = np.random.rand(len(data[0]), hidden_size) * 0.1  # V
            self.bias_hidden = np.array([np.random.rand(hidden_size)]).T           # b
            self.bias_output = np.array([np.random.rand(len(data[0]))]).T          # c

            self.cache_w_hid, self.cache_w_in, self.cache_w_out = 0, 0, 0
            self.cache_b_hid, self.cache_b_out = 0, 0
            self.eps = eps

        def train(self, seq_length, epochs, eta, decay_rate=0.9, learning_decay=0.0):
            # other stuff
            self.update(seq, epoch, eta, decay_rate, learning_decay)
            # other stuff

        def update(self, seq, epoch, eta, decay_rate, learning_decay):
            """Updates the network's weights and biases by applying gradient
            descent using backpropagation through time and RMSProp.
            """
            delta_nabla_c, delta_nabla_b,\
            delta_nabla_v, delta_nabla_w, delta_nabla_u = self.backward_pass(seq)

            eta = eta*np.exp(-epoch*learning_decay)

            # RMSProp
            self.cache_w_hid = decay_rate * self.cache_w_hid \
                             + (1 - decay_rate) * delta_nabla_w**2
            self.weights_hidden -= eta * delta_nabla_w / (np.sqrt(self.cache_w_hid) + self.eps)

            self.cache_w_in = decay_rate * self.cache_w_in \
                            + (1 - decay_rate) * delta_nabla_u**2
            self.weights_input -= eta * delta_nabla_u / (np.sqrt(self.cache_w_in) + self.eps)

            self.cache_w_out = decay_rate * self.cache_w_out \
                             + (1 - decay_rate) * delta_nabla_v**2
            self.weights_output -= eta * delta_nabla_v / (np.sqrt(self.cache_w_out) + self.eps)

            self.cache_b_hid = decay_rate * self.cache_b_hid \
                             + (1 - decay_rate) * delta_nabla_b**2
            self.bias_hidden -= eta * delta_nabla_b / (np.sqrt(self.cache_b_hid) + self.eps)

            self.cache_b_out = decay_rate * self.cache_b_out \
                             + (1 - decay_rate) * delta_nabla_c**2
            self.bias_output -= eta * delta_nabla_c / (np.sqrt(self.cache_b_out) + self.eps)
Every variable under the #RMSProp comment follows the same update rule, namely:
    cache = decay_rate * cache + (1 - decay_rate) * dx**2
    x += - learning_rate * dx / (np.sqrt(cache) + eps)
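For concreteness, one application of this rule to a single parameter would look like the following standalone sketch (the array shapes and values are made up purely for illustration and are not part of my class):

    import numpy as np

    learning_rate, decay_rate, eps = 0.01, 0.9, 0.0001
    x = np.array([0.5, -0.3])      # parameter being trained
    dx = np.array([0.2, 0.1])      # its gradient from backpropagation
    cache = np.zeros_like(x)       # running average of squared gradients

    cache = decay_rate * cache + (1 - decay_rate) * dx**2
    x += - learning_rate * dx / (np.sqrt(cache) + eps)
    print(x)                       # parameter after one RMSProp step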
I have each cache_ variable declared, followed by the update of the corresponding self.weights_ or self.bias_ attribute, and I would like to have this written more compactly. I was looking at using zip(), but I'm not sure how to go about that.
Judging from the question, I'm guessing you are trying to improve readability/elegance rather than going for any other kind of optimization here.
You can introduce a function that implements the update rule, then call it once for each variable. The trick here is that Python lets you access attributes by name, so you can pass in the names of the cache and weights attributes instead of their values. This will also let you update the values for future passes:
    def update_rule(self, cache_attr, x_attr, decay_rate, learning_rate, dx):
        # Update the running cache of squared gradients, then the parameter,
        # looking both up on the instance by their attribute names.
        cache = getattr(self, cache_attr)
        cache = decay_rate * cache + (1 - decay_rate) * dx**2
        setattr(self, cache_attr, cache)

        x = getattr(self, x_attr)
        x += - learning_rate * dx / (np.sqrt(cache) + self.eps)
        setattr(self, x_attr, x)

    def update(self, seq, epoch, eta, decay_rate, learning_decay):
        """Updates the network's weights and biases by applying gradient
        descent using backpropagation through time and RMSProp.
        """
        delta_nabla_c, delta_nabla_b,\
        delta_nabla_v, delta_nabla_w, delta_nabla_u = self.backward_pass(seq)

        eta = eta*np.exp(-epoch*learning_decay)

        self.update_rule('cache_w_hid', 'weights_hidden', decay_rate, eta, delta_nabla_w)
        self.update_rule('cache_w_in', 'weights_input', decay_rate, eta, delta_nabla_u)
        self.update_rule('cache_w_out', 'weights_output', decay_rate, eta, delta_nabla_v)
        self.update_rule('cache_b_hid', 'bias_hidden', decay_rate, eta, delta_nabla_b)
        self.update_rule('cache_b_out', 'bias_output', decay_rate, eta, delta_nabla_c)
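If the access-by-name trick is unfamiliar, here is a minimal standalone illustration of getattr and setattr on a toy class (the class and attribute names are invented for the example and have nothing to do with the network code):

    class Box(object):
        def __init__(self):
            self.value = 1.0

    b = Box()
    print(getattr(b, 'value'))   # 1.0 -- read the attribute via its name as a string
    setattr(b, 'value', 2.5)     # write it back via its name
    print(b.value)               # 2.5

In update_rule, the setattr call is what writes the new cache back onto the instance; the assignment to the local name cache alone would leave self.cache_* unchanged.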
In fact, you can save the additional parameters and avoid exposing what is essentially a private method by putting update_rule inside update. This exposes the namespace of update to update_rule when it is called, so you do not have to pass in decay_rate and learning_rate:
    def update(self, seq, epoch, eta, decay_rate, learning_decay):
        """Updates the network's weights and biases by applying gradient
        descent using backpropagation through time and RMSProp.
        """
        def update_rule(cache_attr, x_attr, dx):
            cache = getattr(self, cache_attr)
            cache = decay_rate * cache + (1 - decay_rate) * dx**2
            setattr(self, cache_attr, cache)

            x = getattr(self, x_attr)
            x += - eta * dx / (np.sqrt(cache) + self.eps)
            setattr(self, x_attr, x)

        delta_nabla_c, delta_nabla_b,\
        delta_nabla_v, delta_nabla_w, delta_nabla_u = self.backward_pass(seq)

        eta = eta*np.exp(-epoch*learning_decay)

        update_rule('cache_w_hid', 'weights_hidden', delta_nabla_w)
        update_rule('cache_w_in', 'weights_input', delta_nabla_u)
        update_rule('cache_w_out', 'weights_output', delta_nabla_v)
        update_rule('cache_b_hid', 'bias_hidden', delta_nabla_b)
        update_rule('cache_b_out', 'bias_output', delta_nabla_c)
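The nested version works because of Python's closures: when update_rule is called, it can read decay_rate, eta and self from the enclosing update call without receiving them as parameters. A minimal standalone illustration of that behaviour (the function names here are invented for the example):

    def make_scaler(factor):
        # scale is defined inside make_scaler, so it can use factor
        # from the enclosing call without taking it as an argument.
        def scale(value):
            return factor * value
        return scale

    double = make_scaler(2)
    print(double(21))   # 42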
Finally, if you wanted to, you could use zip to put the calls to update_rule into a loop. Notice that in this version the order of the calls has been changed to match the order of the values returned by self.backward_pass. I would not use this last version unless you had a lot of updates, because it is starting to get obfuscated, in addition to being sensitive to the result of backward_pass.
    def update(self, seq, epoch, eta, decay_rate, learning_decay):
        """Updates the network's weights and biases by applying gradient
        descent using backpropagation through time and RMSProp.
        """
        def update_rule(cache_attr, x_attr, dx):
            cache = getattr(self, cache_attr)
            cache = decay_rate * cache + (1 - decay_rate) * dx**2
            setattr(self, cache_attr, cache)

            x = getattr(self, x_attr)
            x += - eta * dx / (np.sqrt(cache) + self.eps)
            setattr(self, x_attr, x)

        dx = self.backward_pass(seq)

        eta = eta*np.exp(-epoch*learning_decay)

        cache_attrs = ('cache_b_out', 'cache_b_hid', 'cache_w_out', 'cache_w_hid', 'cache_w_in')
        x_attrs = ('bias_output', 'bias_hidden', 'weights_output', 'weights_hidden', 'weights_input')

        for args in zip(cache_attrs, x_attrs, dx):
            update_rule(*args)
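To see why this version is sensitive to the result of backward_pass: zip pairs the attribute names with the gradients purely by position, so backward_pass must keep returning its values in exactly the order listed in cache_attrs and x_attrs. A toy illustration of the pairing (with placeholder strings standing in for the gradient arrays):

    cache_attrs = ('cache_b_out', 'cache_b_hid')
    x_attrs = ('bias_output', 'bias_hidden')
    dx = ('delta_nabla_c', 'delta_nabla_b')   # stand-ins for the gradient arrays

    for args in zip(cache_attrs, x_attrs, dx):
        print(args)
    # ('cache_b_out', 'bias_output', 'delta_nabla_c')
    # ('cache_b_hid', 'bias_hidden', 'delta_nabla_b')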