GuitarLSTM

Deep learning models for guitar amp/pedal emulation using LSTM with Keras

commit a790c8736525f942abf5487357514447a090a4f3
parent bc0faebae54a13352148a6a1d69171b9f70cd3c3
Author: jmiller656 <joshxmiller656@gmail.com>
Date:   Sat, 30 Jan 2021 23:46:42 -0500

Fix GPU OOM issues

Diffstat:
 M guitar_lstm_colab.ipynb | 511 +++++++++++++++++++++++++++++++++++------------------------------------
1 file changed, 254 insertions(+), 257 deletions(-)

diff --git a/guitar_lstm_colab.ipynb b/guitar_lstm_colab.ipynb
@@ -1,258 +1,256 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "name": "guitar_lstm_colab.ipynb",
-      "provenance": [],
-      "collapsed_sections": []
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "accelerator": "GPU"
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "RF2uyPfxgi8H"
+   },
+   "outputs": [],
+   "source": [
+    "# TO USE: \n",
+    "# 1. Upload your input and output wav files to the current directory in Colab\n",
+    "# 2. Edit the USER INPUTS section to point to your wav files, and choose a\n",
+    "#    model name, and number of epochs for training. If you experience \n",
+    "#    crashing due to low RAM, reduce the \"input_size\" parameter, or increase\n",
+    "#    the \"split_data\" parameter.\n",
+    "# 3. Run each section of code. The trained models and output wav files will be \n",
+    "#    added to the \"models\" directory.\n",
+    "#\n",
+    "# Note: Tested on CPU and GPU runtimes.\n",
+    "\n",
+    "import tensorflow as tf\n",
+    "from tensorflow.keras import Sequential\n",
+    "from tensorflow.keras.layers import LSTM, Conv1D, Dense\n",
+    "from tensorflow.keras.optimizers import Adam\n",
+    "from tensorflow.keras.backend import clear_session\n",
+    "from tensorflow.keras.activations import tanh, elu, relu\n",
+    "from tensorflow.keras.models import load_model\n",
+    "import tensorflow.keras.backend as K\n",
+    "from tensorflow.keras.utils import Sequence\n",
+    "\n",
+    "import os\n",
+    "from scipy import signal\n",
+    "from scipy.io import wavfile\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import math\n",
+    "import h5py"
+   ]
   },
-  "cells": [
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "RF2uyPfxgi8H"
-      },
-      "source": [
-        "# TO USE: \n",
-        "# 1. Upload your input and output wav files to the current directory in Colab\n",
-        "# 2. Edit the USER INPUTS section to point to your wav files, and choose a\n",
-        "#    model name, and number of epochs for training. If you experience \n",
-        "#    crashing due to low RAM, reduce the \"input_size\" parameter, or increase\n",
-        "#    the \"split_data\" parameter.\n",
-        "# 3. Run each section of code. The trained models and output wav files will be \n",
-        "#    added to the \"models\" directory.\n",
-        "#\n",
-        "# Note: Tested on CPU and GPU runtimes.\n",
-        "\n",
-        "import tensorflow as tf\n",
-        "from tensorflow.keras import Sequential\n",
-        "from tensorflow.keras.layers import LSTM, Conv1D, Dense\n",
-        "from tensorflow.keras.optimizers import Adam\n",
-        "from tensorflow.keras.backend import clear_session\n",
-        "from tensorflow.keras.activations import tanh, elu, relu\n",
-        "from tensorflow.keras.models import load_model\n",
-        "import tensorflow.keras.backend as K\n",
-        "from tensorflow.keras.utils import Sequence\n",
-        "\n",
-        "import os\n",
-        "from scipy import signal\n",
-        "from scipy.io import wavfile\n",
-        "import numpy as np\n",
-        "import matplotlib.pyplot as plt\n",
-        "import math\n",
-        "import h5py\n",
-        "\n"
-      ],
-      "execution_count": 1,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "U22mDBe4jaf2"
-      },
-      "source": [
-        "# EDIT THIS SECTION FOR USER INPUTS\n",
-        "#\n",
-        "name = 'test'\n",
-        "in_file = 'ts9_test1_in_FP32.wav'\n",
-        "out_file = 'ts9_test1_out_FP32.wav'\n",
-        "epochs = 1\n",
-        "split_data=4 # **Increase this to reduce RAM usage **\n",
-        "\n",
-        "train_mode = 0    # 0 = speed training, \n",
-        "                  # 1 = accuracy training \n",
-        "                  # 2 = extended training\n",
-        "\n",
-        "input_size = 150 # !!!IMPORTANT !!!: The input_size is set at 150 for Colab notebook. \n",
-        "                 # A higher setting may result in crashing due to\n",
-        "                 # memory limitation of 8GB for the free version\n",
-        "                 # of Colab. This setting limits the accuracy of\n",
-        "                 # the training, especially for complex guitar signals\n",
-        "                 # such as high distortion.\n",
-        "                 # \n",
-        "                 # !!!IMPORTANT!!!: You will most likely need to cycle the runtime to \n",
-        "                 # free up RAM between training sessions.\n",
-        "                 #\n",
-        "                 # Increase the \"split_data\" parameter to reduce the RAM used and\n",
-        "                 # still allow for a higher \"input_size\" setting. \n",
-        "                 #\n",
-        "                 # Future dev note: Using a custom dataloader may be a good\n",
-        "                 #                  workaround for this limitation, at the cost\n",
-        "                 #                  of slower training.\n",
-        "\n",
-        "if not os.path.exists('models/'+name):\n",
-        "    os.makedirs('models/'+name)\n",
-        "else:\n",
-        "    print(\"A model with the same name already exists. Please choose a new name.\")\n",
-        "    exit\n"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "WqI-cGt1jaG2"
-      },
-      "source": [
-        "\n",
-        "def pre_emphasis_filter(x, coeff=0.95):\n",
-        "    return tf.concat([x, x - coeff * x], 1)\n",
-        "    \n",
-        "def error_to_signal(y_true, y_pred): \n",
-        "    \"\"\"\n",
-        "    Error to signal ratio with pre-emphasis filter:\n",
-        "    \"\"\"\n",
-        "    y_true, y_pred = pre_emphasis_filter(y_true), pre_emphasis_filter(y_pred)\n",
-        "    return K.sum(tf.pow(y_true - y_pred, 2), axis=0) / K.sum(tf.pow(y_true, 2), axis=0) + 1e-10\n",
-        "    \n",
-        "def save_wav(name, data):\n",
-        "    wavfile.write(name, 44100, data.flatten().astype(np.float32))\n",
-        "\n",
-        "def normalize(data):\n",
-        "    data_max = max(data)\n",
-        "    data_min = min(data)\n",
-        "    data_norm = max(data_max,abs(data_min))\n",
-        "    return data / data_norm\n",
-        "\n",
-        "\n",
-        "'''This is a similar Tensorflow/Keras implementation of the LSTM model from the paper:\n",
-        "    \"Real-Time Guitar Amplifier Emulation with Deep Learning\"\n",
-        "    https://www.mdpi.com/2076-3417/10/3/766/htm\n",
-        "\n",
-        "    Uses a stack of two 1-D Convolutional layers, followed by LSTM, followed by \n",
-        "    a Dense (fully connected) layer. Three preset training modes are available, \n",
-        "    with further customization by editing the code. A Sequential tf.keras model \n",
-        "    is implemented here.\n",
-        "\n",
-        "    Note: RAM may be a limiting factor for the parameter \"input_size\". The wav data\n",
-        "    is preprocessed and stored in RAM, which improves training speed but quickly runs out\n",
-        "    if using a large number for \"input_size\". Reduce this if you are experiencing\n",
-        "    RAM issues. \n",
-        "    \n",
-        "    --training_mode=0  Speed training (default)\n",
-        "    --training_mode=1  Accuracy training\n",
-        "    --training_mode=2  Extended training (set max_epochs as desired, for example 50+)\n",
-        "'''\n",
-        "\n",
-        "batch_size = 4096 \n",
-        "test_size = 0.2\n",
-        "\n",
-        "if train_mode == 0:         # Speed Training\n",
-        "    learning_rate = 0.01 \n",
-        "    conv1d_strides = 12 \n",
-        "    conv1d_filters = 16\n",
-        "    hidden_units = 36\n",
-        "elif train_mode == 1:       # Accuracy Training (~10x longer than Speed Training)\n",
-        "    learning_rate = 0.01 \n",
-        "    conv1d_strides = 4\n",
-        "    conv1d_filters = 36\n",
-        "    hidden_units= 64\n",
-        "else:                       # Extended Training (~60x longer than Accuracy Training)\n",
-        "    learning_rate = 0.0005 \n",
-        "    conv1d_strides = 3\n",
-        "    conv1d_filters = 36\n",
-        "    hidden_units= 96\n",
-        "\n",
-        "\n",
-        "# Create Sequential Model ###########################################\n",
-        "clear_session()\n",
-        "model = Sequential()\n",
-        "model.add(Conv1D(conv1d_filters, 12,strides=conv1d_strides, activation=None, padding='same',input_shape=(input_size,1)))\n",
-        "model.add(Conv1D(conv1d_filters, 12,strides=conv1d_strides, activation=None, padding='same'))\n",
-        "model.add(LSTM(hidden_units))\n",
-        "model.add(Dense(1, activation=None))\n",
-        "model.compile(optimizer=Adam(learning_rate=learning_rate), loss=error_to_signal, metrics=[error_to_signal])\n",
-        "print(model.summary())\n",
-        "\n",
-        "# Load and Preprocess Data ###########################################\n",
-        "in_rate, in_data = wavfile.read(in_file)\n",
-        "out_rate, out_data = wavfile.read(out_file)\n",
-        "\n",
-        "X_all = in_data.astype(np.float32).flatten() \n",
-        "X_all = normalize(X_all).reshape(len(X_all),1) \n",
-        "y_all = out_data.astype(np.float32).flatten() \n",
-        "y_all = normalize(y_all).reshape(len(y_all),1) \n",
-        "\n",
-        "# If splitting the data for training, do this part\n",
-        "if split_data > 1:\n",
-        "    num_split = len(X_all) // split_data\n",
-        "    X = X_all[0:num_split*split_data]\n",
-        "    y = y_all[0:num_split*split_data]\n",
-        "    X_data = np.split(X, split_data)\n",
-        "    y_data = np.split(y, split_data)\n",
-        "\n",
-        "    # Perform training on each split dataset\n",
-        "    for i in range(len(X_data)):\n",
-        "        print(\"\\nTraining on split data \" + str(i+1) + \"/\" +str(len(X_data)))\n",
-        "        X_split = X_data[i]\n",
-        "        y_split = y_data[i]\n",
-        "\n",
-        "        y_ordered = y_split[input_size-1:] \n",
-        "\n",
-        "        indices = np.arange(input_size) + np.arange(len(X_split)-input_size+1)[:,np.newaxis] \n",
-        "        X_ordered = tf.gather(X_split,indices) \n",
-        "\n",
-        "        shuffled_indices = np.random.permutation(len(X_ordered)) \n",
-        "        X_random = tf.gather(X_ordered,shuffled_indices)\n",
-        "        y_random = tf.gather(y_ordered, shuffled_indices)\n",
-        "\n",
-        "        # Train Model ###################################################\n",
-        "        model.fit(X_random,y_random, epochs=epochs, batch_size=batch_size, validation_split=0.2) \n",
-        "\n",
-        "\n",
-        "    model.save('models/'+name+'/'+name+'.h5')\n",
-        "\n",
-        "# If training on the full set of input data in one run, do this part\n",
-        "else:\n",
-        "    y_ordered = y_all[input_size-1:] \n",
-        "\n",
-        "    indices = np.arange(input_size) + np.arange(len(X_all)-input_size+1)[:,np.newaxis] \n",
-        "    X_ordered = tf.gather(X_all,indices) \n",
-        "\n",
-        "    shuffled_indices = np.random.permutation(len(X_ordered)) \n",
-        "    X_random = tf.gather(X_ordered,shuffled_indices)\n",
-        "    y_random = tf.gather(y_ordered, shuffled_indices)\n",
-        "\n",
-        "    # Train Model ###################################################\n",
-        "    model.fit(X_random,y_random, epochs=epochs, batch_size=batch_size, validation_split=test_size) \n",
-        "\n",
-        "    model.save('models/'+name+'/'+name+'.h5')\n",
-        "\n",
-        "# Run Prediction #################################################\n",
-        "print(\"Running prediction..\")\n",
-        "\n",
-        "# Get the last 20% of the wav data to run prediction and plot results\n",
-        "y_the_rest, y_last_part = np.split(y_all, [int(len(y_all)*.8)])\n",
-        "x_the_rest, x_last_part = np.split(X_all, [int(len(X_all)*.8)])\n",
-        "y_test = y_last_part[input_size-1:] \n",
-        "indices = np.arange(input_size) + np.arange(len(x_last_part)-input_size+1)[:,np.newaxis] \n",
-        "X_test = tf.gather(x_last_part,indices) \n",
-        "\n",
-        "prediction = model.predict(X_test, batch_size=batch_size)\n",
-        "\n",
-        "save_wav('models/'+name+'/y_pred.wav', prediction)\n",
-        "save_wav('models/'+name+'/x_test.wav', x_last_part)\n",
-        "save_wav('models/'+name+'/y_test.wav', y_test)\n",
-        "\n",
-        "# Add additional data to the saved model (like input_size)\n",
-        "filename = 'models/'+name+'/'+name+'.h5'\n",
-        "f = h5py.File(filename, 'a')\n",
-        "grp = f.create_group(\"info\")\n",
-        "dset = grp.create_dataset(\"input_size\", (1,), dtype='int16')\n",
-        "dset[0] = input_size\n",
-        "f.close()"
-      ],
-      "execution_count": null,
-      "outputs": []
-    }
-  ]
-}
\ No newline at end of file
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "U22mDBe4jaf2"
+   },
+   "outputs": [],
+   "source": [
+    "# EDIT THIS SECTION FOR USER INPUTS\n",
+    "#\n",
+    "name = 'test'\n",
+    "in_file = 'data/ts9_test1_in_FP32.wav'\n",
+    "out_file = 'data/ts9_test1_out_FP32.wav'\n",
+    "epochs = 1\n",
+    "\n",
+    "train_mode = 0    # 0 = speed training, \n",
+    "                  # 1 = accuracy training \n",
+    "                  # 2 = extended training\n",
+    "\n",
+    "input_size = 150 # !!!IMPORTANT !!!: The input_size is set at 150 for Colab notebook. \n",
+    "                 # A higher setting may result in crashing due to\n",
+    "                 # memory limitation of 8GB for the free version\n",
+    "                 # of Colab. This setting limits the accuracy of\n",
+    "                 # the training, especially for complex guitar signals\n",
+    "                 # such as high distortion.\n",
+    "                 # \n",
+    "                 # !!!IMPORTANT!!!: You will most likely need to cycle the runtime to \n",
+    "                 # free up RAM between training sessions.\n",
+    "                 #\n",
+    "                 # Increase the \"split_data\" parameter to reduce the RAM used and\n",
+    "                 # still allow for a higher \"input_size\" setting. \n",
+    "                 #\n",
+    "                 # Future dev note: Using a custom dataloader may be a good\n",
+    "                 #                  workaround for this limitation, at the cost\n",
+    "                 #                  of slower training.\n",
+    "\n",
+    "if not os.path.exists('models/'+name):\n",
+    "    os.makedirs('models/'+name)\n",
+    "else:\n",
+    "    print(\"A model with the same name already exists. Please choose a new name.\")\n",
+    "    exit"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "WqI-cGt1jaG2"
+   },
+   "outputs": [],
+   "source": [
+    "def pre_emphasis_filter(x, coeff=0.95):\n",
+    "    return tf.concat([x, x - coeff * x], 1)\n",
+    "    \n",
+    "def error_to_signal(y_true, y_pred): \n",
+    "    \"\"\"\n",
+    "    Error to signal ratio with pre-emphasis filter:\n",
+    "    \"\"\"\n",
+    "    y_true, y_pred = pre_emphasis_filter(y_true), pre_emphasis_filter(y_pred)\n",
+    "    return K.sum(tf.pow(y_true - y_pred, 2), axis=0) / K.sum(tf.pow(y_true, 2), axis=0) + 1e-10\n",
+    "    \n",
+    "def save_wav(name, data):\n",
+    "    wavfile.write(name, 44100, data.flatten().astype(np.float32))\n",
+    "\n",
+    "def normalize(data):\n",
+    "    data_max = max(data)\n",
+    "    data_min = min(data)\n",
+    "    data_norm = max(data_max,abs(data_min))\n",
+    "    return data / data_norm\n",
+    "\n",
+    "\n",
+    "'''This is a similar Tensorflow/Keras implementation of the LSTM model from the paper:\n",
+    "    \"Real-Time Guitar Amplifier Emulation with Deep Learning\"\n",
+    "    https://www.mdpi.com/2076-3417/10/3/766/htm\n",
+    "\n",
+    "    Uses a stack of two 1-D Convolutional layers, followed by LSTM, followed by \n",
+    "    a Dense (fully connected) layer. Three preset training modes are available, \n",
+    "    with further customization by editing the code. A Sequential tf.keras model \n",
+    "    is implemented here.\n",
+    "\n",
+    "    Note: RAM may be a limiting factor for the parameter \"input_size\". The wav data\n",
+    "    is preprocessed and stored in RAM, which improves training speed but quickly runs out\n",
+    "    if using a large number for \"input_size\". Reduce this if you are experiencing\n",
+    "    RAM issues. \n",
+    "    \n",
+    "    --training_mode=0  Speed training (default)\n",
+    "    --training_mode=1  Accuracy training\n",
+    "    --training_mode=2  Extended training (set max_epochs as desired, for example 50+)\n",
+    "'''\n",
+    "\n",
+    "batch_size = 4096 \n",
+    "test_size = 0.2\n",
+    "\n",
+    "if train_mode == 0:         # Speed Training\n",
+    "    learning_rate = 0.01 \n",
+    "    conv1d_strides = 12 \n",
+    "    conv1d_filters = 16\n",
+    "    hidden_units = 36\n",
+    "elif train_mode == 1:       # Accuracy Training (~10x longer than Speed Training)\n",
+    "    learning_rate = 0.01 \n",
+    "    conv1d_strides = 4\n",
+    "    conv1d_filters = 36\n",
+    "    hidden_units= 64\n",
+    "else:                       # Extended Training (~60x longer than Accuracy Training)\n",
+    "    learning_rate = 0.0005 \n",
+    "    conv1d_strides = 3\n",
+    "    conv1d_filters = 36\n",
+    "    hidden_units= 96\n",
+    "\n",
+    "\n",
+    "# Create Sequential Model ###########################################\n",
+    "clear_session()\n",
+    "model = Sequential()\n",
+    "model.add(Conv1D(conv1d_filters, 12,strides=conv1d_strides, activation=None, padding='same',input_shape=(input_size,1)))\n",
+    "model.add(Conv1D(conv1d_filters, 12,strides=conv1d_strides, activation=None, padding='same'))\n",
+    "model.add(LSTM(hidden_units))\n",
+    "model.add(Dense(1, activation=None))\n",
+    "model.compile(optimizer=Adam(learning_rate=learning_rate), loss=error_to_signal, metrics=[error_to_signal])\n",
+    "model.summary()\n",
+    "\n",
+    "# Load and Preprocess Data ###########################################\n",
+    "in_rate, in_data = wavfile.read(in_file)\n",
+    "out_rate, out_data = wavfile.read(out_file)\n",
+    "\n",
+    "X_all = in_data.astype(np.float32).flatten() \n",
+    "X_all = normalize(X_all).reshape(len(X_all),1) \n",
+    "y_all = out_data.astype(np.float32).flatten() \n",
+    "y_all = normalize(y_all).reshape(len(y_all),1)\n",
+    "\n",
+    "y_ordered = y_all[input_size-1:] \n",
+    "indices = np.arange(input_size) + np.arange(len(X_all)-input_size+1)[:,np.newaxis] \n",
+    "x_ordered = np.take(X_all, indices)[:,:, np.newaxis]\n",
+    "\n",
+    "# Train Model ###################################################\n",
+    "model.fit(x_ordered,y_ordered, epochs=epochs, batch_size=batch_size, validation_split=test_size, shuffle=True) \n",
+    "model.save('models/'+name+'/'+name+'.h5')\n",
+    "\n",
+    "# Run Prediction #################################################\n",
+    "print(\"Running prediction..\")\n",
+    "\n",
+    "# Get the last 20% of the wav data to run prediction and plot results\n",
+    "y_the_rest, y_last_part = np.split(y_all, [int(len(y_all)*.8)])\n",
+    "x_the_rest, x_last_part = np.split(X_all, [int(len(X_all)*.8)])\n",
+    "y_test = y_last_part[input_size-1:] \n",
+    "indices = np.arange(input_size) + np.arange(len(x_last_part)-input_size+1)[:,np.newaxis] \n",
+    "X_test = np.take(x_last_part,indices)[:, :, np.newaxis]\n",
+    "\n",
+    "prediction = model.predict(X_test, batch_size=batch_size)\n",
+    "\n",
+    "save_wav('models/'+name+'/y_pred.wav', prediction)\n",
+    "save_wav('models/'+name+'/x_test.wav', x_last_part)\n",
+    "save_wav('models/'+name+'/y_test.wav', y_test)\n",
+    "\n",
+    "# Add additional data to the saved model (like input_size)\n",
+    "filename = 'models/'+name+'/'+name+'.h5'\n",
+    "f = h5py.File(filename, 'a')\n",
+    "grp = f.create_group(\"info\")\n",
+    "dset = grp.create_dataset(\"input_size\", (1,), dtype='int16')\n",
+    "dset[0] = input_size\n",
+    "f.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import IPython.display as ipd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ipd.Audio('models/'+name+'/y_pred.wav')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "name": "guitar_lstm_colab.ipynb",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}