Source code for compiam.melody.pitch_extraction.ftanet_carnatic

import os
import math
import librosa

import numpy as np
from compiam.exceptions import ModelNotTrainedError

from compiam.utils.pitch import normalisation, resampling
from compiam.utils.download import download_remote_model
from compiam.melody.pitch_extraction.ftanet_carnatic.pitch_processing import (
    batchize_test,
    get_est_arr,
)
from compiam.melody.pitch_extraction.ftanet_carnatic.cfp import cfp_process
from compiam.io import write_csv
from compiam.utils import get_logger, WORKDIR

logger = get_logger(__name__)


class FTANetCarnatic(object):
    """FTA-Net melody extraction tuned to Carnatic Music."""

    def __init__(
        self,
        model_path=None,
        download_link=None,
        download_checksum=None,
        sample_rate=8000,
        gpu="-1",
    ):
        """FTA-Net melody extraction init method.

        :param model_path: path to the file with the model weights.
        :param download_link: link to the remote pre-trained model.
        :param download_checksum: checksum of the model file.
        :param sample_rate: sample rate to which the audio is resampled for extraction.
        :param gpu: id of the available GPU to use (-1 by default, to run on CPU),
            use string: '0', '1', etc.
        """
        ### IMPORTING OPTIONAL DEPENDENCIES
        try:
            global tf
            import tensorflow as tf
        except ImportError:
            raise ImportError(
                "In order to use this tool you need to have tensorflow installed. "
                "Install compIAM with tensorflow support: pip install 'compiam[tensorflow]'"
            )
        ###

        ## Setting up GPU if specified
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)
        self.gpu = gpu

        self.model = self._build_model()
        self.sample_rate = sample_rate
        self.trained = False

        self.model_path = model_path
        self.download_link = download_link
        self.download_checksum = download_checksum
        if self.model_path is not None:
            self.load_model(self.model_path)
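    # A minimal usage sketch, assuming the pre-trained Carnatic weights are
    # exposed through compIAM's load_model wrapper (the "melody:ftanet-carnatic"
    # identifier is taken from the compIAM documentation; adjust if it differs):
    #
    #     import compiam
    #     ftanet = compiam.load_model("melody:ftanet-carnatic")
    #
    # Alternatively, instantiate FTANetCarnatic directly with model_path pointing
    # to locally stored weights.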
    @staticmethod
    def SF_Module(x_list, n_channel, reduction, limitation):
        """Selection and fusion module.
        Implementation taken from https://github.com/yushuai/FTANet-melodic

        :param x_list: list of tensor inputs.
        :param n_channel: number of feature channels.
        :param reduction: the rate to which the data is compressed.
        :param limitation: setting a compressing limit.
        :returns: a tensor with the fused and selected feature map.
        """
        ## Split
        fused = None
        for x_s in x_list:
            if fused is None:
                fused = x_s
            else:
                fused = tf.keras.layers.Add()([fused, x_s])

        ## Fuse
        fused = tf.keras.layers.GlobalAveragePooling2D()(fused)
        fused = tf.keras.layers.BatchNormalization()(fused)
        fused = tf.keras.layers.Dense(
            max(n_channel // reduction, limitation), activation="selu"
        )(fused)

        ## Select
        masks = []
        for i in range(len(x_list)):
            masks.append(tf.keras.layers.Dense(n_channel)(fused))
        mask_stack = tf.keras.layers.Lambda(
            tf.keras.backend.stack, arguments={"axis": -1}
        )(masks)  # (n_channel, n_kernel)
        mask_stack = tf.keras.layers.Softmax(axis=-2)(mask_stack)

        selected = None
        for i, x_s in enumerate(x_list):
            mask = tf.keras.layers.Lambda(lambda z: z[:, :, i])(mask_stack)
            mask = tf.keras.layers.Reshape((1, 1, n_channel))(mask)
            x_s = tf.keras.layers.Multiply()([x_s, mask])
            if selected is None:
                selected = x_s
            else:
                selected = tf.keras.layers.Add()([selected, x_s])
        return selected
    @staticmethod
    def FTA_Module(x, shape, kt, kf):
        """Frequency-temporal attention module.
        Implementation taken from https://github.com/yushuai/FTANet-melodic

        :param x: input tensor.
        :param shape: the shape of the input tensor.
        :param kt: kernel size for time attention.
        :param kf: kernel size for frequency attention.
        :returns: the resized input, the time-attention map, and the
            frequency-attention map.
        """
        x = tf.keras.layers.BatchNormalization()(x)

        ## Residual
        x_r = tf.keras.layers.Conv2D(
            shape[2], (1, 1), padding="same", activation="relu"
        )(x)

        ## Time Attention
        # Attn Map (1, T, C), FC
        a_t = tf.keras.layers.Lambda(tf.keras.backend.mean, arguments={"axis": -3})(x)
        a_t = tf.keras.layers.Conv1D(shape[2], kt, padding="same", activation="selu")(
            a_t
        )
        a_t = tf.keras.layers.Conv1D(shape[2], kt, padding="same", activation="selu")(
            a_t
        )  # 2
        a_t = tf.keras.layers.Softmax(axis=-2)(a_t)
        a_t = tf.keras.layers.Reshape((1, shape[1], shape[2]))(a_t)
        # Reweight
        x_t = tf.keras.layers.Conv2D(
            shape[2], (3, 3), padding="same", activation="selu"
        )(x)
        x_t = tf.keras.layers.Conv2D(
            shape[2], (5, 5), padding="same", activation="selu"
        )(x_t)
        x_t = tf.keras.layers.Multiply()([x_t, a_t])

        ## Frequency Attention
        # Attn Map (F, 1, C), Conv1D
        a_f = tf.keras.layers.Lambda(tf.keras.backend.mean, arguments={"axis": -2})(x)
        a_f = tf.keras.layers.Conv1D(shape[2], kf, padding="same", activation="selu")(
            a_f
        )
        a_f = tf.keras.layers.Conv1D(shape[2], kf, padding="same", activation="selu")(
            a_f
        )
        a_f = tf.keras.layers.Softmax(axis=-2)(a_f)
        a_f = tf.keras.layers.Reshape((shape[0], 1, shape[2]))(a_f)
        # Reweight
        x_f = tf.keras.layers.Conv2D(
            shape[2], (3, 3), padding="same", activation="selu"
        )(x)
        x_f = tf.keras.layers.Conv2D(
            shape[2], (5, 5), padding="same", activation="selu"
        )(x_f)
        x_f = tf.keras.layers.Multiply()([x_f, a_f])

        return x_r, x_t, x_f
    def _build_model(self, input_shape=(320, 128, 3)):
        """Building the entire FTA-Net.
        Implementation taken from https://github.com/yushuai/FTANet-melodic

        :param input_shape: input shape.
        :returns: a tensorflow Model instance of the FTA-Net.
        """
        visible = tf.keras.layers.Input(shape=input_shape)
        x = tf.keras.layers.BatchNormalization()(visible)

        ## Bottom
        # bm = BatchNormalization()(x)
        bm = x
        bm = tf.keras.layers.Conv2D(
            16, (4, 1), padding="valid", strides=(4, 1), activation="selu"
        )(bm)  # 80
        bm = tf.keras.layers.Conv2D(
            16, (4, 1), padding="valid", strides=(4, 1), activation="selu"
        )(bm)  # 20
        bm = tf.keras.layers.Conv2D(
            16, (4, 1), padding="valid", strides=(4, 1), activation="selu"
        )(bm)  # 5
        bm = tf.keras.layers.Conv2D(
            1, (5, 1), padding="valid", strides=(5, 1), activation="selu"
        )(bm)  # 1

        shape = input_shape
        x_r, x_t, x_f = self.FTA_Module(x, (shape[0], shape[1], 32), 3, 3)
        x = self.SF_Module([x_r, x_t, x_f], 32, 4, 4)
        x = tf.keras.layers.MaxPooling2D((2, 2))(x)

        x_r, x_t, x_f = self.FTA_Module(x, (shape[0] // 2, shape[1] // 2, 64), 3, 3)
        x = self.SF_Module([x_r, x_t, x_f], 64, 4, 4)
        x = tf.keras.layers.MaxPooling2D((2, 2))(x)

        x_r, x_t, x_f = self.FTA_Module(x, (shape[0] // 4, shape[1] // 4, 128), 3, 3)
        x = self.SF_Module([x_r, x_t, x_f], 128, 4, 4)
        x_r, x_t, x_f = self.FTA_Module(x, (shape[0] // 4, shape[1] // 4, 128), 3, 3)
        x = self.SF_Module([x_r, x_t, x_f], 128, 4, 4)

        x = tf.keras.layers.UpSampling2D((2, 2))(x)
        x_r, x_t, x_f = self.FTA_Module(x, (shape[0] // 2, shape[1] // 2, 64), 3, 3)
        x = self.SF_Module([x_r, x_t, x_f], 64, 4, 4)

        x = tf.keras.layers.UpSampling2D((2, 2))(x)
        x_r, x_t, x_f = self.FTA_Module(x, (shape[0], shape[1], 32), 3, 3)
        x = self.SF_Module([x_r, x_t, x_f], 32, 4, 4)

        x_r, x_t, x_f = self.FTA_Module(x, (shape[0], shape[1], 1), 3, 3)
        x = self.SF_Module([x_r, x_t, x_f], 1, 4, 4)

        x = tf.keras.layers.Concatenate(axis=1)([bm, x])

        # Softmax
        x = tf.keras.layers.Lambda(tf.keras.backend.squeeze, arguments={"axis": -1})(x)
        x = tf.keras.layers.Softmax(axis=-2)(x)
        return tf.keras.models.Model(inputs=visible, outputs=x)

    def load_model(self, model_path):
        """Load model weights from the given path, downloading them if missing."""
        path_to_check = model_path
        if ".data-00000-of-00001" not in model_path:
            path_to_check = model_path + ".data-00000-of-00001"
        if not os.path.exists(path_to_check):
            self.download_model(model_path)  # Downloading model weights
        self.model.load_weights(model_path).expect_partial()
        self.model_path = model_path
        self.trained = True
    def download_model(self, model_path=None, force_overwrite=False):
        """Download the pre-trained model.

        :param model_path: path to the model weights; used to infer the download
            folder (defaults to the compIAM working directory).
        :param force_overwrite: whether to force re-download even if the files
            already exist.
        """
        download_path = (
            os.sep + os.path.join(*model_path.split(os.sep)[:-2])
            if model_path is not None
            else os.path.join(WORKDIR, "models", "melody", "ftanet-carnatic")
        )
        # Creating model folder to store the weights
        if not os.path.exists(download_path):
            os.makedirs(download_path)
        download_remote_model(
            self.download_link,
            self.download_checksum,
            download_path,
            force_overwrite=force_overwrite,
        )
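    # Sketch of forcing a fresh download of the published weights (the remote
    # link and checksum are the ones stored on the instance at init time):
    #
    #     ftanet.download_model(force_overwrite=True)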
    def predict(
        self,
        input_data,
        input_sr=44100,
        hop_size=80,
        batch_size=5,
        out_step=None,
        gpu="-1",
    ):
        """Extract melody from input_data.
        Implementation taken (and slightly adapted) from
        https://github.com/yushuai/FTANet-melodic.

        :param input_data: path to audio file or numpy array like audio signal.
        :param input_sr: sampling rate of the input array of data (if any). This
            variable is only relevant if the input is an array of data instead
            of a filepath.
        :param hop_size: hop size between frequency estimations.
        :param batch_size: number of minutes of audio passed through the model
            at once (defaults to 5; increase if you have enough computational
            power, reduce if needed).
        :param out_step: desired time-step duration (in seconds) for the output,
            if resampling is needed.
        :param gpu: id of the available GPU to use (-1 by default, to run on CPU),
            use string: '0', '1', etc.
        :returns: a 2-D list with time-stamps and pitch values per timestamp.
        """
        ## Setting up GPU if any
        if gpu != self.gpu:
            os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)
            self.gpu = gpu

        if self.trained is False:
            raise ModelNotTrainedError(
                """Model is not trained. Please load model before running inference!
                You can load the pre-trained instance with the load_model wrapper."""
            )

        # Loading and resampling audio
        if isinstance(input_data, str):
            if not os.path.exists(input_data):
                raise FileNotFoundError("Target audio not found.")
            audio, _ = librosa.load(input_data, sr=self.sample_rate)
        elif isinstance(input_data, np.ndarray):
            logger.warning(
                f"Resampling... (input sampling rate is assumed {input_sr}Hz, "
                "make sure this is correct and change input_sr otherwise)"
            )
            audio = librosa.resample(
                input_data, orig_sr=input_sr, target_sr=self.sample_rate
            )
        else:
            raise ValueError("Input must be path to audio signal or an audio array")

        # Mixing down to mono if the input has more than one channel
        audio_shape = audio.shape
        if len(audio_shape) > 1:
            audio_channels = min(audio_shape)
            if audio_channels == 1:
                audio = audio.flatten()
            else:
                audio = np.mean(audio, axis=np.argmin(audio_shape))

        xlist = []
        timestamps = []
        audio_len = len(audio)
        batch_min = self.sample_rate * 60 * batch_size
        freqs = []
        if audio_len > batch_min:
            iters = math.ceil(audio_len / batch_min)
            for i in np.arange(iters):
                if i < iters - 1:
                    audio_in = audio[batch_min * i : batch_min * (i + 1)]
                if i == iters - 1:
                    audio_in = audio[batch_min * i :]
                feature, _, time_arr = cfp_process(
                    audio_in, sr=self.sample_rate, hop=hop_size
                )
                data = batchize_test(feature, size=128)
                xlist.append(data)
                timestamps.append(time_arr)
                estimation = get_est_arr(self.model, xlist, timestamps, batch_size=16)
                if i == 0:
                    freqs = estimation[:, 1]
                else:
                    freqs = np.concatenate((freqs, estimation[:, 1]))
        else:
            feature, _, time_arr = cfp_process(
                audio, sr=self.sample_rate, hop=hop_size
            )
            data = batchize_test(feature, size=128)
            xlist.append(data)
            timestamps.append(time_arr)
            # Getting estimated pitch
            estimation = get_est_arr(self.model, xlist, timestamps, batch_size=16)
            freqs = estimation[:, 1]
        TStamps = np.linspace(0, audio_len / self.sample_rate, len(freqs))
        freqs[freqs < 50] = 0  # Estimations below 50 Hz are treated as unvoiced
        output = np.array([TStamps, freqs]).transpose()

        if out_step is not None:
            new_len = int((audio_len / self.sample_rate) // out_step)
            return resampling(output, new_len)

        return output
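    # Sketch of a typical predict() call (the file path is hypothetical):
    #
    #     pitch_track = ftanet.predict("path/to/recording.wav")
    #     times, freqs = pitch_track[:, 0], pitch_track[:, 1]
    #
    # Each row holds (time-stamp in seconds, pitch in Hz), with unvoiced frames
    # set to 0 Hz; pass out_step to resample the track to a fixed time step.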
    @staticmethod
    def normalise_pitch(pitch, tonic, bins_per_octave=120, max_value=4):
        """Normalise pitch given a tonic.

        :param pitch: a 2-D list with time-stamps and pitch values per timestamp.
        :param tonic: recording tonic to normalize the pitch to.
        :param bins_per_octave: number of frequency bins per octave.
        :param max_value: maximum value to clip the normalized pitch to.
        :returns: a 2-D list with time-stamps and pitch values per timestamp,
            normalised to the given tonic.
        """
        return normalisation(
            pitch, tonic, bins_per_octave=bins_per_octave, max_value=max_value
        )
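    # Normalisation sketch (the tonic value below is illustrative):
    #
    #     normalised = FTANetCarnatic.normalise_pitch(pitch_track, tonic=146.83)
    #
    # The computation itself is delegated to compiam.utils.pitch.normalisation.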
    @staticmethod
    def save_pitch(data, output_path):
        """Call the write_csv function in compiam.io to write the output pitch
        curve into a file.

        :param data: the data to write.
        :param output_path: the path where the data is going to be stored.
        :returns: None
        """
        return write_csv(data, output_path)
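# Writing the extracted (or normalised) pitch track to disk, continuing the
# sketches above (the output path is hypothetical):
#
#     ftanet.save_pitch(normalised, "path/to/pitch.csv")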