Introduction to Echo State Network#

@Chaoming Wang

import brainpy as bp
import brainpy.math as bm
bm.set_platform('cpu')

# enable x64 computation
bm.enable_x64()
import numpy as np
import matplotlib.pyplot as plt

Echo State Network#

Echo State Networks (ESNs) are applied to supervised temporal machine learning tasks where for a given training input signal \(x(n)\) a desired target output signal \(y^{target}(n)\) is known. Here \(n=1, ..., T\) is the discrete time and \(T\) is the number of data points in the training dataset.

The task is to learn a model with output \(y(n)\), where \(y(n)\) matches \(y^{target}(n)\) as well as possible, minimizing an error measure \(E(y, y^{target})\), and, more importantly, generalizes well to unseen data.

ESNs use an RNN type with leaky-integrated discrete-time continuous-value units. The typical update equations are

\[\begin{split} \hat{h}(n) = \tanh(W^{in} x(n) + W^{rec}h(n-1) + W^{fb}y(n-1) + b^{rec}) \\ h(n) = (1 - \alpha) x(n-1)+\alpha \hat{h}(n) \end{split}\]

where \(h(n)\) is a vector of reservoir neuron activations, \(W^{in}\) and \(W^{rec}\) are the input and recurrent weight matrices respectively, and \(\alpha \in (0, 1]\) is the leaking rate. The model is also sometimes used without the leaky integration, which is a special case of \(\alpha=1\).

The linear readout layer is defined as

\[ y(n) = W^{out} h(n) + b^{out} \]

where \(y(n)\) is network output, \(W^{out}\) the output weight matrix, and \(b^{out}\) is the output bias.

An additional nonlinearity can be applied to \(y(n)\), as well as feedback connections \(W^{fb}\) from \(y(n-1)\) to \(\hat{h}(n)\).

A graphical representation of an ESN illustrating our notation and the idea for training is depicted in the following figure.

echo state machine

Ridge regression#

Finding the optimal weights \(W^{out}\) that minimize the squared error between \(y(n)\) and \(y^{target}(n)\) amounts to solving a typically overdetermined system of linear equations

\[Y^{target} = W^{out}X\]

Probably the most universal and stable solution is ridge regression, also known as regression with Tikhonov regularization:

\[W^{out} = Y^{target}X^T(XX^T+\beta I)^{-1}\]

Dataset#

Mackey-Glass equation are a set of delayed differential equations describing the temporal behaviour of different physiological signal, for example, the relative quantity of mature blood cells over time.

The equations are defined as:

\[ \frac{dP(t)}{dt} = \frac{\beta P(t - \tau)}{1 + P(t - \tau)^n} - \gamma P(t) \]

where \(\beta = 0.2\), \(\gamma = 0.1\), \(n = 10\), and the time delay \(\tau = 17\). \(\tau\) controls the chaotic behaviour of the equations (the higher it is, the more chaotic the timeserie becomes. \(\tau=17\) already gives good chaotic results.)

def plot_mackey_glass_series(ts, x_series, x_tau_series, num_sample):
  plt.figure(figsize=(13, 5))

  plt.subplot(121)
  plt.title(f"Timeserie - {num_sample} timesteps")
  plt.plot(ts[:num_sample], x_series[:num_sample], lw=2, color="lightgrey", zorder=0)
  plt.scatter(ts[:num_sample], x_series[:num_sample], c=ts[:num_sample], cmap="viridis", s=6)
  plt.xlabel("$t$")
  plt.ylabel("$P(t)$")

  ax = plt.subplot(122)
  ax.margins(0.05)
  plt.title(f"Phase diagram: $P(t) = f(P(t-\\tau))$")
  plt.plot(x_tau_series[: num_sample], x_series[: num_sample], lw=1, color="lightgrey", zorder=0)
  plt.scatter(x_tau_series[:num_sample], x_series[: num_sample], lw=0.5, c=ts[:num_sample], cmap="viridis", s=6)
  plt.xlabel("$P(t-\\tau)$")
  plt.ylabel("$P(t)$")
  cbar = plt.colorbar()
  cbar.ax.set_ylabel('$t$')

  plt.tight_layout()
  plt.show()

An easy way to get Mackey-Glass time-series data is using brainpy.dataset.mackey_glass_series(). If you want to see the details of the implementation, please see the corresponding source code.

dt = 0.1
mg_data = bp.datasets.mackey_glass_series(25000, dt=dt, tau=17, beta=0.2, gamma=0.1, n=10)
plot_mackey_glass_series(mg_data['ts'], mg_data['x'], mg_data['x_tau'], num_sample=int(1000 / dt))
../_images/5cd4d83947b21aa79538c434659d0f70cdff2e4b6221d7e4dcbd8899d29427dd.png

Task 1: prediction of Mackey-Glass timeseries#

Predict \(P(t+1), \cdots, P(t+N)\) from \(P(t)\).

Prepare the data#

def get_data(t_warm, t_forcast, t_train, sample_rate=1):
    warmup = int(t_warm / dt)  # warmup the reservoir
    forecast = int(t_forcast / dt)  # predict 10 ms ahead
    train_length = int(t_train / dt)

    X_warm = mg_data['x'][:warmup:sample_rate]
    X_warm = bm.expand_dims(X_warm, 0)

    X_train = mg_data['x'][warmup: warmup+train_length: sample_rate]
    X_train = bm.expand_dims(X_train, 0)

    Y_train = mg_data['x'][warmup+forecast: warmup+train_length+forecast: sample_rate]
    Y_train = bm.expand_dims(Y_train, 0)

    X_test = mg_data['x'][warmup + train_length: -forecast: sample_rate]
    X_test = bm.expand_dims(X_test, 0)

    Y_test = mg_data['x'][warmup + train_length + forecast::sample_rate]
    Y_test = bm.expand_dims(Y_test, 0)

    return X_warm, X_train, Y_train, X_test, Y_test
# First warmup the reservoir using the first 100 ms
# Then, train the network in 20000 ms to predict 1 ms chaotic series ahead
x_warm, x_train, y_train, x_test, y_test = get_data(100, 1, 20000)
sample = 3000
fig = plt.figure(figsize=(15, 5))
plt.plot(x_train[0, :sample], label="Training data")
plt.plot(y_train[0, :sample], label="True prediction")
plt.legend()
plt.show()
../_images/009fca2fe0ac43af1ad1e60bca0464d3953705cf3e3a31b831723bee2f5306a1.png

Prepare the ESN#

class ESN(bp.dyn.DynamicalSystem):
  def __init__(self, num_in, num_hidden, num_out, sr=1.,
               Win_initializer=bp.init.Uniform(0, 0.2),
               leaky_rate=0.3):
    super(ESN, self).__init__()
    self.r = bp.layers.Reservoir(
        num_in, num_hidden,
        Win_initializer=Win_initializer,
        spectral_radius=sr,
        leaky_rate=leaky_rate,
        mode=bp.modes.batching
    )
    self.o = bp.layers.Dense(num_hidden, num_out)

  def update(self, sha, x):
    return self.o(sha, self.r(sha, x))

Train and test ESN#

model = ESN(1, 100, 1)
model.reset_state(1)
trainer = bp.train.RidgeTrainer(model, alpha=1e-6)
# warmup
_ = trainer.predict(x_warm)
# train
_ = trainer.fit([x_train, y_train])

Test the training data.

ys_predict = trainer.predict(x_train)
start, end = 1000, 6000
plt.figure(figsize=(15, 7))
plt.subplot(211)
plt.plot(bm.as_numpy(ys_predict)[0, start:end, 0],
         lw=3, label="ESN prediction")
plt.plot(bm.as_numpy(y_train)[0, start:end, 0], linestyle="--",
         lw=2, label="True value")
plt.title(f'Mean Square Error: {bp.losses.mean_squared_error(ys_predict, y_train)}')
plt.legend()
plt.show()
../_images/7efc028201e5f639c7debfa980dbfd9ab5bc90f3bf71393c5615c0aec4309c7a.png

Test the testing data.

ys_predict = trainer.predict(x_test)

start, end = 1000, 6000
plt.figure(figsize=(15, 7))
plt.subplot(211)
plt.plot(bm.as_numpy(ys_predict)[0, start:end, 0], lw=3, label="ESN prediction")
plt.plot(bm.as_numpy(y_test)[0,start:end, 0], linestyle="--", lw=2, label="True value")
plt.title(f'Mean Square Error: {bp.losses.mean_squared_error(ys_predict, y_test)}')
plt.legend()
plt.show()
../_images/1cb2b7075a91307c64cf333bedee92048f6a770f6396f4755814e69634d97b3e.png

Make the task harder#

# First warmup the reservoir using the first 100 ms
# Then, train the network in 20000 ms to predict 10 ms chaotic series ahead
x_warm, x_train, y_train, x_test, y_test = get_data(100, 10, 20000)
sample = 3000
plt.figure(figsize=(15, 5))
plt.plot(x_train[0, :sample], label="Training data")
plt.plot(y_train[0, :sample], label="True prediction")
plt.legend()
plt.show()
../_images/5d2a53f947ac8b1adfaed82109522cd8f544ddef01e30763c003812d0fda8506.png
model = ESN(1, 100, 1, sr=1.1)
model.reset_state(1)
trainer = bp.train.RidgeTrainer(model, alpha=1e-6)
# warmup
_ = trainer.predict(x_warm)

# train
_ = trainer.fit([x_train, y_train])
ys_predict = trainer.predict(x_test, )

start, end = 1000, 6000
plt.figure(figsize=(15, 7))
plt.subplot(211)
plt.plot(bm.as_numpy(ys_predict)[0, start:end, 0], lw=3, label="ESN prediction")
plt.plot(bm.as_numpy(y_test)[0, start:end, 0], linestyle="--", lw=2, label="True value")
plt.title(f'Mean Square Error: {bp.losses.mean_squared_error(ys_predict, y_test)}')
plt.legend()
plt.show()
../_images/d1a9101029615287817b63841348c5ccd11f43b1bace0543bb63e2d4733c9702.png

Diving into the reservoir#

Let’s have a look at the effect of some of the hyperparameters of the ESN.

Spectral radius#

The spectral radius is defined as the maximum eigenvalue of the reservoir matrix.

num_sample = 20
all_radius = [-0.5, 0.5, 1.25, 2.5, 10.]

plt.figure(figsize=(15, len(all_radius) * 3))
for i, s in enumerate(all_radius):
  model = ESN(1, 100, 1, sr=s)
  model.reset_state(1)
  runner = bp.train.DSTrainer(model, monitors={'state': model.r.state})
  _ = runner.predict(x_test[:, :10000])
  states = bm.as_numpy(runner.mon['state'])

  plt.subplot(len(all_radius), 1, i + 1)
  plt.plot(states[0, :, :num_sample])
  plt.ylabel(f"spectral radius=${all_radius[i]}$")
plt.xlabel(f"States ({num_sample} neurons)")
plt.show()
../_images/8b08c09b1ac30c91e88cb5d6b1d9b1ed0f4e26c35e1c7dde752453bdeeb5372d.png
  • spectral radius < 1 \(\rightarrow\) stable dynamics

  • spectral radius > 1 \(\rightarrow\) chaotic dynamics

In most cases, it should have a value around \(1.0\) to ensure the echo state property (ESP): the dynamics of the reservoir should not be bound to the initial state chosen, and remains close to chaos.

This value also heavily depends on the input scaling.

Input scaling#

The input scaling controls how the ESN interact with the inputs. It is a coefficient appliyed to the input matrix \(W^{in}\).

num_sample = 20
all_input_scaling = [0.1, 1.0, 10.0]

plt.figure(figsize=(15, len(all_radius) * 3))
for i, s in enumerate(all_input_scaling):
  model = ESN(1, 100, 1, sr=1., Win_initializer=bp.init.Uniform(max_val=s))
  model.reset_state(1)
  runner = bp.train.DSTrainer(model, monitors={'state': model.r.state})
  _ = runner.predict(x_test[:, :10000])
  states = bm.as_numpy(runner.mon['state'])

  plt.subplot(len(all_radius), 1, i + 1)
  plt.plot(states[0, :, :num_sample])
  plt.ylabel(f"input scaling=${all_radius[i]}$")
plt.xlabel(f"States ({num_sample} neurons)")
plt.show()
../_images/4153ebf4de6a115afa5651a12622ac945c89853d7bfd51924bcf7c7b24abea5b.png

Leaking rate#

The leaking rate (\(\alpha\)) controls the “memory feedback” of the ESN. The ESN states are indeed computed as:

\[ h(t+1) = \underbrace{\color{red}{(1 - \alpha)} h(t)}_{\text{previous states}} + \underbrace{\color{red}\alpha f(x(t+1), h(t))}_{\text{new states}} \]

where \(h\) is the state, \(x\) is the input data, \(f\) is the ESN model function, defined as:

\[ f(x, h) = \tanh(W^{in} \cdotp x + W^{rec} \cdotp h) \]

\(\alpha\) must be in \([0, 1]\).

num_sample = 20
all_rates = [0.001, 0.01, 0.1, 1.]

plt.figure(figsize=(15, len(all_radius) * 3))
for i, s in enumerate(all_rates):
  model = ESN(1, 100, 1, sr=1., leaky_rate=s,
              Win_initializer=bp.init.Uniform(max_val=1.), )
  model.reset_state(1)
  runner = bp.train.DSTrainer(model, monitors={'state': model.r.state})
  _ = runner.predict(x_test[:, :10000])
  states = bm.as_numpy(runner.mon['state'])

  plt.subplot(len(all_radius), 1, i + 1)
  plt.plot(states[0, :, :num_sample])
  plt.ylabel(f"leaky rate=${all_radius[i]}$")
plt.xlabel(f"States ({num_sample} neurons)")
plt.show()
../_images/974eb63e86df26ad5a54f76d7b1dc0650ed98742b3468fbe7196ee5677cfc28c.png

Let’s reduce the input influence to see what is happening inside the reservoir (input scaling set to 0.2):

num_sample = 20
all_rates = [0.001, 0.01, 0.1, 1.]

plt.figure(figsize=(15, len(all_radius) * 3))
for i, s in enumerate(all_rates):
  model = ESN(1, 100, 1, sr=1., leaky_rate=s,
              Win_initializer=bp.init.Uniform(max_val=.2), )
  model.reset_state(1)
  runner = bp.train.DSTrainer(model, monitors={'state': model.r.state})
  _ = runner.predict(x_test[:, :10000])
  states = bm.as_numpy(runner.mon['state'])

  plt.subplot(len(all_radius), 1, i + 1)
  plt.plot(states[0, :, :num_sample])
  plt.ylabel(f"leaky rate=${all_radius[i]}$")
plt.xlabel(f"States ({num_sample} neurons)")
plt.show()
../_images/dca2b9a73d4012d41ed2666c87bf15e934fae087aaebd74d8fa8e03e48527760.png
  • high leaking rate \(\rightarrow\) low inertia, little memory of previous states

  • low leaking rate \(\rightarrow\) high inertia, big memory of previous states

The leaking rate can be seen as the inverse of the reservoir’s time contant.

Task 2: generation of Mackey-Glass timeseries#

Generative mode: the output of ESN will be used as the input.

During this task, the ESN is trained to make a short forecast of the timeserie (1 timestep ahead). Then, it will be asked to run on its own outputs, trying to predict its own behaviour.

# First warmup the reservoir using the first 500 ms
# Then, train the network in 20000 ms to predict 1 ms chaotic series ahead
x_warm, x_train, y_train, x_test, y_test = get_data(500, 1, 20000, sample_rate=int(1/dt))
sample = 300
fig = plt.figure(figsize=(15, 5))
plt.plot(x_train[0, :sample], label="Training data")
plt.plot(y_train[0, :sample], label="True prediction")
plt.legend()
plt.show()
../_images/d69e1f22c5d88dd9e8d19d1fc56d912f2ed12092782331e194c361e355c0288a.png
model = ESN(1, 100, 1, sr=1.1, Win_initializer=bp.init.Uniform(max_val=.2), )
model.reset_state(1)
trainer = bp.train.RidgeTrainer(model, alpha=1e-7)
# warmup
_ = trainer.predict(x_warm)

# train
trainer.fit([x_train, y_train])

# test
ys_predict = trainer.predict(x_train)
start, end = 100, 600
plt.figure(figsize=(15, 7))
plt.subplot(211)
plt.plot(bm.as_numpy(ys_predict)[0, start:end, 0],
         lw=3, label="ESN prediction")
plt.plot(bm.as_numpy(y_train)[0, start:end, 0], linestyle="--",
         lw=2, label="True value")
plt.title(f'Mean Square Error: {bp.losses.mean_squared_error(ys_predict, y_train)}')
plt.legend()
plt.show()
../_images/3f6b7aad7f5180e061d199a29b9a12f39fe936e50e3c550043e6d9829854b3a8.png
jit_model = bm.jit(model)
outputs = [x_test[:, 0]]
truths = [x_test[:, 1]]
for i in range(200):
    outputs.append(jit_model(dict(), outputs[-1]))
    truths.append(x_test[:, i+2])
outputs = bm.asarray(outputs)
truths = bm.asarray(truths)
plt.figure(figsize=(15, 10))
plt.plot(bm.as_numpy(truths).squeeze()[:200], label='Ground truth')
plt.plot(bm.as_numpy(outputs).squeeze()[:200], label='Prediction')
plt.legend()
plt.show()
../_images/be972fdac8f3ced6af04ba8af20b0ad2945f28b4e3b852a4416d043d5070572a.png

References#

  • Jaeger, H.: The “echo state” approach to analysing and training recurrent neural networks. Technical Report GMD Report 148, German National Research Center for Information Technology (2001)

  • Lukoševičius, Mantas. “A Practical Guide to Applying Echo State Networks.” Neural Networks: Tricks of the Trade (2012).