import os
import librosa
import numpy as np
import compiam
from compiam.melody.raga_recognition.deepsrgm.raga_mapping import create_mapping
from compiam.exceptions import (
ModelNotTrainedError,
DatasetNotLoadedError,
)
from compiam.utils import get_logger, stereo_to_mono, WORKDIR
from compiam.utils.download import download_remote_model
logger = get_logger(__name__)
class DEEPSRGM(object):
    """DEEPSRGM model for raga classification.

    This DEEPSRGM implementation has been kindly provided by Shubham Lohiya
    and Swarada Bharadwaj.
    """

    # Parameter-name suffixes of the recurrent layer that are renamed when a
    # checkpoint is loaded (see ``_rename_rnn_keys``).
    _RNN_KEY_SUFFIXES = (
        ".weight_ih_l0",
        ".weight_hh_l0",
        ".bias_ih_l0",
        ".bias_hh_l0",
    )
    # Length (in pitch frames) of each sub-sequence returned by get_features.
    _SUBSEQUENCE_LENGTH = 5000
    # Number of randomly positioned sub-sequences returned by get_features.
    _NUM_SUBSEQUENCES = 200

    def __init__(
        self,
        model_path=None,
        download_link=None,
        download_checksum=None,
        rnn="lstm",
        mapping_path=None,
        sample_rate=44100,
        device=None,
    ):
        """DEEPSRGM init method.

        :param model_path: path to the model weights, or a container indexed
            by rnn type (e.g. ``{"lstm": ..., "gru": ...}``) holding such paths.
        :param download_link: link to the remote pre-trained model.
        :param download_checksum: checksum of the model file.
        :param rnn: type of rnn used "lstm" or "gru"
        :param mapping_path: path to raga to id JSON mapping
        :param sample_rate: sampling rate which the model is trained
        :param device: torch device to route the model to. If not given,
            "cuda" is used when available and "cpu" otherwise.
        :raises ImportError: if torch (or the model definition) cannot be imported.
        """
        ### IMPORTING OPTIONAL DEPENDENCIES
        try:
            global torch
            import torch

            global deepsrgmModel
            from compiam.melody.raga_recognition.deepsrgm.model import deepsrgmModel
        except Exception as err:
            raise ImportError(
                "In order to use this tool you need to have torch installed. "
                "Install compIAM with torch support: pip install 'compiam[torch]'"
            ) from err
        ###

        # Honour an explicitly requested device; auto-detect only otherwise.
        if device:
            self.device = device
        else:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.rnn = rnn

        # To prevent CUDNN_STATUS_NOT_INITIALIZED error in case of incompatible GPU
        try:
            self.model = self._build_model(rnn=self.rnn)
        except Exception:
            logger.warning(
                "Could not build the model on device {}, falling back to cpu.".format(
                    self.device
                )
            )
            self.device = "cpu"
            self.model = self._build_model(rnn=self.rnn)

        self.model_path = model_path
        self.download_link = download_link
        self.download_checksum = download_checksum
        self.sample_rate = sample_rate
        self.trained = False

        ## Loading LSTM model by default
        if self.model_path is not None:
            if isinstance(self.model_path, (str, os.PathLike)):
                weights_path = self.model_path
            else:
                # Container of paths indexed by rnn type.
                weights_path = self.model_path[self.rnn]
            self.load_model(model_path=weights_path, rnn=self.rnn)

        self.mapping_path = mapping_path
        self.selected_ragas = [
            5,
            8,
            10,
            13,
            17,
            20,
            22,
            23,
            24,
            28,
        ]  # pre-defined for release 0.1.0

        # Always define the attribute so that predict() can test it for None.
        self.mapping = None
        if mapping_path is not None:
            self.load_mapping(self.selected_ragas)
        self.dataset = None

    def _build_model(self, rnn="lstm"):
        """Build a DEEPSRGM network and move it to ``self.device``.

        :param rnn: lstm (default) or gru.
        :returns: the instantiated model.
        """
        return deepsrgmModel(rnn=rnn).to(self.device)

    def load_mapping(self, selection=None):
        """Loading raga mapping for DEEPSRGM

        The mapping is built from ``self.mapping_path`` and stored in
        ``self.mapping``.

        :param selection: Selection of ragas for the DEEPSRGM model. If None,
            the default selection stored in ``self.selected_ragas`` is used.
        """
        selected_ragas = self.selected_ragas if selection is None else selection
        self.mapping = create_mapping(self.mapping_path, selected_ragas)

    @classmethod
    def _rename_rnn_keys(cls, weights, rnn):
        """Return a copy of ``weights`` with ``<rnn>.*`` keys renamed to ``rnn.*``.

        Keys that are not present under the ``<rnn>`` prefix are left untouched.
        """
        renamed = weights.copy()
        for suffix in cls._RNN_KEY_SUFFIXES:
            old_key = rnn + suffix
            if old_key in renamed:
                renamed["rnn" + suffix] = renamed.pop(old_key)
        return renamed

    def load_model(self, model_path, rnn="lstm"):
        """Loading weights for DEEPSRGM

        If ``model_path`` does not exist, the pre-trained model is downloaded
        first. On success ``self.trained`` is set to True.

        :param model_path: path to model.
        :param rnn: lstm (default) or gru.
        """
        if not os.path.exists(model_path):
            self.download_model(model_path)

        # The network must match the recurrent layer type of the checkpoint,
        # so rebuild it whenever a type other than the current one is requested.
        if rnn != getattr(self, "rnn", None):
            self.model = self._build_model(rnn=rnn)
            self.rnn = rnn
        self.model_path = model_path

        try:
            weights = torch.load(
                model_path, weights_only=True, map_location=self.device
            )
        except Exception:
            # Best-effort fallback: retry without ``weights_only``
            # (presumably for torch versions that do not accept the argument).
            weights = torch.load(model_path, map_location=self.device)

        self.model.load_state_dict(self._rename_rnn_keys(weights, rnn))
        self.trained = True

    def download_model(self, model_path=None, force_overwrite=False):
        """Download pre-trained model.

        :param model_path: path where the weights are expected. The download
            folder is the directory two levels above this file. If None, the
            default folder inside WORKDIR is used.
        :param force_overwrite: forwarded to the download helper.
        """
        if model_path is not None:
            # Grandparent directory of the weights file; works for relative
            # and absolute paths alike.
            weights_dir = os.path.dirname(os.fspath(model_path))
            download_path = os.path.dirname(weights_dir) or os.curdir
        else:
            download_path = os.path.join(WORKDIR, "models", "melody", "deepsrgm")

        # Creating model folder to store the weights
        os.makedirs(download_path, exist_ok=True)
        download_remote_model(
            self.download_link,
            self.download_checksum,
            download_path,
            force_overwrite=force_overwrite,
        )

    def load_raga_dataset(self, data_home=None, download=False):
        """Load an instance of the Compmusic raga dataset to assist the tool

        The loader is stored in ``self.dataset``.

        :param data_home: path where to store the dataset data
        :param download: if True, call the loader's ``download()`` and warn
            that the audio has to be requested separately.
        """
        self.dataset = compiam.load_dataset(
            "compmusic_raga", data_home=data_home, version="default"
        )
        if download:
            self.dataset.download()  # Downloads index and features
            logger.warning(
                "\nThe features are downloaded, but the audio of this dataset is private.\n"
                "Please request it in this link: https://zenodo.org/records/7278511,\n"
                f"download it, and unzip it in {data_home} following the instructions.\n"
            )

    @staticmethod
    def _read_pitch_and_tonic(pitch_path, tonic_path):
        """Read a pitch track and a tonic value from text files.

        NOTE(review): the pitch value is taken from the last whitespace- or
        comma-separated column of each non-empty line, which covers both
        single-column and "time<sep>pitch" files - confirm against the
        dataset format.

        :returns: (array of frequencies, tonic as float)
        """
        freqs = []
        with open(pitch_path) as pitch_file:
            for line in pitch_file:
                fields = line.replace(",", " ").split()
                if fields:
                    freqs.append(float(fields[-1]))
        with open(tonic_path) as tonic_file:
            # Parsed with float() instead of eval(): the file content is
            # external data and must not be executed.
            tonic = float(tonic_file.read().strip())
        return np.array(freqs, dtype=float), tonic

    def _extract_pitch_and_tonic(self, input_data, input_sr):
        """Compute pitch track and tonic from audio with the essentia-based tools.

        :returns: (array of frequencies, tonic)
        """
        try:
            import essentia.standard as estd  # availability check only

            melodia = compiam.melody.pitch_extraction.Melodia
            melodia = melodia(sample_rate=self.sample_rate)
            tonic_extraction = (
                compiam.melody.tonic_identification.TonicIndianMultiPitch
            )
            tonic_extraction = tonic_extraction(sample_rate=self.sample_rate)
        except Exception as err:
            raise ImportError(
                "In order to use these tools to extract the features you need to have essentia installed. "
                "Install compIAM with essentia support: pip install 'compiam[essentia]'"
            ) from err

        # Loading and resampling audio
        if isinstance(input_data, str):
            if not os.path.exists(input_data):
                raise FileNotFoundError("Target audio not found.")
            audio, _ = librosa.load(input_data, sr=self.sample_rate)
        elif isinstance(input_data, np.ndarray):
            input_data = stereo_to_mono(input_data)
            logger.warning(
                f"Resampling... (input sampling rate is assumed {input_sr}Hz, "
                "make sure this is correct and change input_sr otherwise)"
            )
            audio = librosa.resample(
                input_data, orig_sr=input_sr, target_sr=self.sample_rate
            )
        else:
            raise ValueError("Input must be path to audio signal or an audio array")

        logger.info("Extracting pitch track using melodia...")
        freqs = melodia.extract(audio)[:, 1]
        logger.info("Extracting tonic using multi-pitch approach...")
        tonic = tonic_extraction.extract(audio)
        return freqs, tonic

    def get_features(
        self,
        input_data=None,
        input_sr=44100,
        pitch_path=None,
        tonic_path=None,
        from_mirdata=False,
        track_id=None,
        k=5,
    ):
        """Computing features for prediction of DEEPSRGM

        :param input_data: path to audio file or numpy array like audio signal.
        :param input_sr: sampling rate of the input array of data (if any). This variable is only
            relevant if the input is an array of data instead of a filepath.
        :param pitch_path: path to pre-computed pitch file (if available).
        :param tonic_path: path to pre-computed tonic file (if available).
        :param from_mirdata: boolean to indicate if the features are parsed from the mirdata loader of
            Indian Art Music Raga Recognition Dataset (must be specifically this one).
        :param track_id: track id for the Indian Art Music Raga Recognition Dataset if from_mirdata is
            set to True.
        :param k: k indicating the precision of the pitch feature.
        :returns: array with 200 randomly positioned sub-sequences of 5000
            tonic-normalised pitch values each.
        :raises DatasetNotLoadedError: if from_mirdata is set but no dataset is loaded.
        :raises ValueError: if the track id is missing, the input type is not
            supported, or the pitch track has 5000 values or fewer.
        """
        if (pitch_path is not None) and (tonic_path is not None):
            freqs, tonic = self._read_pitch_and_tonic(pitch_path, tonic_path)
        elif from_mirdata:
            if self.dataset is None:
                raise DatasetNotLoadedError(
                    "Dataloader is not initialized. Have you run .load_raga_dataset()?"
                )
            if track_id is None:
                raise ValueError(
                    "To load a track we need a track id. See mirdata instructions "
                    "to know how to list the available ids."
                )
            track = self.dataset.track(track_id)
            freqs, tonic = self._read_pitch_and_tonic(
                track.pitch_post_processed_path, track.tonic_fine_tuned_path
            )
        else:
            freqs, tonic = self._extract_pitch_and_tonic(input_data, input_sr)

        # Normalise pitch: cents relative to the tonic, scaled by k / 100 and
        # rounded. clip(0) maps negative values (including the -inf produced
        # by zero-frequency frames) to 0.
        feature = np.round(1200 * np.log2(freqs / tonic) * (k / 100)).clip(0)

        seq_len = self._SUBSEQUENCE_LENGTH
        if len(feature) <= seq_len:
            raise ValueError(
                "\nAudio signal is not longer enough for a proper estimation. "
                "Please provide a larger audio.\n"
            )

        # Draw the random start positions one by one (same random stream as a
        # plain loop), then slice the sub-sequences.
        max_start = len(feature) - seq_len
        starts = [
            np.random.randint(0, max_start) for _ in range(self._NUM_SUBSEQUENCES)
        ]
        return np.array([feature[start : start + seq_len] for start in starts])

    def predict(self, features, threshold=0.6, gpu="-1"):
        """Predict raga for recording

        :param features: all subsequences for a certain music recording
        :param threshold: majority voting threshold
        :param gpu: Id of the available GPU to use (-1 by default, to run on CPU)
        :return: recognition result
        :raises ValueError: if a string is given instead of extracted features.
        :raises ModelNotTrainedError: if no weights have been loaded.
        """
        ## Setting up GPU if any
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        if isinstance(features, str):
            raise ValueError(
                "Please first extract features using .get_features() and use "
                "these as input for this predict function."
            )

        # Make sure model is loaded
        if self.trained is False:
            raise ModelNotTrainedError(
                "\nModel is not trained. Please load model before running inference!\n"
                "You can load the pre-trained instance with the load_model wrapper.\n"
            )

        # Make sure mapping is loaded
        if getattr(self, "mapping", None) is None:
            self.load_mapping(self.selected_ragas)
        list_of_ragas = list(self.mapping.values())

        # Predict
        logger.info(
            "Performing prediction for the following {} ragas: {}".format(
                len(list_of_ragas), list_of_ragas
            )
        )
        with torch.no_grad():
            out = self.model.forward(torch.from_numpy(features).to(self.device).long())
        preds = torch.argmax(out, dim=-1)

        # Majority voting over the per-subsequence predictions.
        majority, _ = torch.mode(preds)
        majority = int(majority)
        votes = float(torch.sum(preds == majority)) / features.shape[0]

        if votes >= threshold:
            logger.info(
                "Input music sample belongs to the {} raga".format(
                    self.mapping[majority]
                )
            )
        else:
            # Only report confusion when the vote share is below the threshold.
            logger.info(
                "CONFUSED - Closest raga predicted is {} with {} votes".format(
                    self.mapping[majority], (votes * 100)
                )
            )
        return self.mapping[majority]