# Source code for compiam.melody.raga_recognition.deepsrgm

import os
import librosa

import numpy as np

import compiam
from compiam.melody.raga_recognition.deepsrgm.raga_mapping import create_mapping
from compiam.exceptions import (
    ModelNotTrainedError,
    DatasetNotLoadedError,
)
from compiam.utils import get_logger, stereo_to_mono, WORKDIR
from compiam.utils.download import download_remote_model

logger = get_logger(__name__)


class DEEPSRGM(object):
    """DEEPSRGM model for raga classification.

    This DEEPSRGM implementation has been kindly provided by Shubham Lohiya
    and Swarada Bharadwaj.
    """

    def __init__(
        self,
        model_path=None,
        download_link=None,
        download_checksum=None,
        rnn="lstm",
        mapping_path=None,
        sample_rate=44100,
        device=None,
    ):
        """DEEPSRGM init method.

        :param model_path: path to file to the model weights
        :param download_link: link to the remote pre-trained model.
        :param download_checksum: checksum of the model file.
        :param rnn: type of rnn used "lstm" or "gru"
        :param mapping_path: path to raga to id JSON mapping
        :param sample_rate: sampling rate which the model is trained
        :param device: torch CUDA config to route model to GPU
        """
        ### IMPORTING OPTIONAL DEPENDENCIES
        try:
            global torch
            import torch

            global deepsrgmModel
            from compiam.melody.raga_recognition.deepsrgm.model import deepsrgmModel
        except ImportError as err:
            # Narrowed from a bare except: only a failed import should be
            # reported as a missing optional dependency.
            raise ImportError(
                "In order to use this tool you need to have torch installed. "
                "Install compIAM with torch support: pip install 'compiam[torch]'"
            ) from err
        ###

        # BUG FIX: the original only assigned self.device when no device was
        # passed in, leaving the attribute unset (AttributeError later in
        # _build_model) whenever a caller supplied one explicitly.
        self.device = device if device else (
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        self.rnn = rnn

        # To prevent CUDNN_STATUS_NOT_INITIALIZED error in case of an
        # incompatible GPU, fall back to CPU if building the model fails.
        try:
            self.model = self._build_model(rnn=self.rnn)
        except Exception:
            self.device = "cpu"
            self.model = self._build_model(rnn=self.rnn)

        self.model_path = model_path
        self.download_link = download_link
        self.download_checksum = download_checksum
        self.sample_rate = sample_rate
        self.trained = False
        # Initialize so predict() can safely test `self.mapping is None`
        # even when no mapping_path was provided.
        self.mapping = None

        ## Loading LSTM model by default
        if self.model_path is not None:
            self.load_model(model_path=self.model_path[self.rnn], rnn=self.rnn)

        self.mapping_path = mapping_path
        # Raga id selection pre-defined for release 0.1.0
        self.selected_ragas = [5, 8, 10, 13, 17, 20, 22, 23, 24, 28]

        if (mapping_path is not None) and (self.selected_ragas is not None):
            self.load_mapping(self.selected_ragas)

        self.dataset = None

    def _build_model(self, rnn="lstm"):
        """Build the DEEPSRGM torch module and route it to ``self.device``.

        :param rnn: lstm (default) or gru.
        """
        return deepsrgmModel(rnn=rnn).to(self.device)
[docs] def load_mapping(self, selection=None): """Loading raga mapping for DEEPSRGM :param selection: Selection of ragas for the DEEPSRGM model. A default selection is initialized by default in compiam v1.0. Flexible selection and training of this model is under development at this moment and will be available in the next release. """ selected_ragas = self.selected_ragas if selection is None else selection self.mapping = create_mapping(self.mapping_path, selected_ragas)
[docs] def load_model(self, model_path, rnn="lstm"): """Loading weights for DEEPSRGM :param model_path: path to model. :param rnn: lstm (default) or gru. """ if not os.path.exists(model_path): self.download_model(model_path) if rnn == "gru": self.model = self._build_model(rnn="gru") self.model_path = model_path try: weights = torch.load(model_path, weights_only=True, map_location=self.device) except: weights = torch.load(model_path, map_location=self.device) new_weights = weights.copy() keys_to_fix = [ ".weight_ih_l0", ".weight_hh_l0", ".bias_ih_l0", ".bias_hh_l0", ] keys_to_fix = [rnn + x for x in keys_to_fix] for i in keys_to_fix: new_weights[i.replace(rnn, "rnn")] = weights[i] del new_weights[i] self.model.load_state_dict(new_weights) self.trained = True
[docs] def download_model(self, model_path=None, force_overwrite=False): """Download pre-trained model.""" download_path = ( os.sep + os.path.join(*model_path.split(os.sep)[:-2]) if model_path is not None else os.path.join(WORKDIR, "models", "melody", "deepsrgm") ) # Creating model folder to store the weights if not os.path.exists(download_path): os.makedirs(download_path) download_remote_model( self.download_link, self.download_checksum, download_path, force_overwrite=force_overwrite, )
[docs] def load_raga_dataset(self, data_home=None, download=False): """Load an instance of the Compmusic raga dataset to assist the tool :param data_home: path where to store the dataset data :param download: """ self.dataset = compiam.load_dataset( "compmusic_raga", data_home=data_home, version="default" ) if download: self.dataset.download() # Downloads index and features logger.warning( f""" The features are downloaded, but the audio of this dataset is private. Please request it in this link: https://zenodo.org/records/7278511, download it, and unzip it in {data_home} following the instructions. """ )
[docs] def get_features( self, input_data=None, input_sr=44100, pitch_path=None, tonic_path=None, from_mirdata=False, track_id=None, k=5, ): """Computing features for prediction of DEEPSRM :param input_data: path to audio file or numpy array like audio signal. :param input_sr: sampling rate of the input array of data (if any). This variable is only relevant if the input is an array of data instead of a filepath. :param pitch_path: path to pre-computed pitch file (if available). :param tonic_path: path to pre-computed tonic file (if available). :param from_mirdata: boolean to indicate if the features are parsed from the mirdata loader of Indian Art Music Raga Recognition Dataset (must be specifically this one). :param track_id: track id for the Indian Art Music Raga Recognition Dataset if from_mirdata is set to True. :param k: k indicating the precision of the pitch feature. """ if (pitch_path is not None) and (tonic_path is not None): freqs = open(pitch_path).read().strip().split("\n") tonic = eval(open(tonic_path).read().strip()) elif from_mirdata: if self.dataset is None: raise DatasetNotLoadedError( "Dataloader is not initialized. Have you run .load_raga_dataset()?" ) if track_id is None: raise ValueError( "To load a track we need a track id. See mirdata instructions \ to know how to list the available ids." ) track = self.dataset.track(track_id) pitch_path = track.pitch_post_processed_path tonic_path = track.tonic_fine_tuned_path freqs = open(pitch_path).read().strip().split("\n") tonic = eval(open(tonic_path).read().strip()) else: try: import essentia.standard as estd melodia = compiam.melody.pitch_extraction.Melodia melodia = melodia(sample_rate=self.sample_rate) tonic_extraction = ( compiam.melody.tonic_identification.TonicIndianMultiPitch ) tonic_extraction = tonic_extraction(sample_rate=self.sample_rate) except: raise ImportError( "In order to use these tools to extract the features you need to have essentia installed." 
"Install compIAM with essentia support: pip install 'compiam[essentia]'" ) # Loading and resampling audio if isinstance(input_data, str): if not os.path.exists(input_data): raise FileNotFoundError("Target audio not found.") audio, _ = librosa.load(input_data, sr=self.sample_rate) elif isinstance(input_data, np.ndarray): input_data = stereo_to_mono(input_data) logger.warning( f"Resampling... (input sampling rate is assumed {input_sr}Hz, \ make sure this is correct and change input_sr otherwise)" ) audio = librosa.resample( input_data, orig_sr=input_sr, target_sr=self.sample_rate ) else: raise ValueError("Input must be path to audio signal or an audio array") logger.info("Extracting pitch track using melodia...") freqs = melodia.extract(audio)[:, 1] logger.info("Extracting tonic using multi-pitch approach...") tonic = tonic_extraction.extract(audio) # Normalise pitch feature = np.round(1200 * np.log2(freqs / tonic) * (k / 100)).clip(0) N = 200 a = [] if len(feature) <= 5000: raise ValueError( """ Audio signal is not longer enough for a proper estimation. Please provide a larger audio. """ ) for i in range(N): c = np.random.randint(0, len(feature) - 5000) a.append(feature[c : c + 5000]) return np.array(a)
[docs] def predict(self, features, threshold=0.6, gpu="-1"): """Predict raga for recording :param features: all subsequences for a certain music recording :param threshold: majority voting threshold :param gpu: Id of the available GPU to use (-1 by default, to run on CPU) :return: recognition result """ ## Setting up GPU if any os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu) if isinstance(features, str): raise ValueError( "Please first extract features using .get_features() and use \ these as input for this predict function." ) # Make sure model is loaded if self.trained is False: raise ModelNotTrainedError( """ Model is not trained. Please load model before running inference! You can load the pre-trained instance with the load_model wrapper. """ ) # Make sure mapping is loaded if self.mapping is None: self.load_mapping(self.selected_ragas) list_of_ragas = list(self.mapping.values()) # Predict logger.info( "Performing prediction for the following {} ragas: {}".format( len(list_of_ragas), list_of_ragas ) ) with torch.no_grad(): out = self.model.forward(torch.from_numpy(features).to(self.device).long()) preds = torch.argmax(out, axis=-1) majority, _ = torch.mode(preds) majority = int(majority) votes = float(torch.sum(preds == majority)) / features.shape[0] if votes >= threshold: logger.info( "Input music sample belongs to the {} raga".format( self.mapping[majority] ) ) logger.info( "CONFUSED - Closest raga predicted is {} with {} votes".format( self.mapping[majority], (votes * 100) ) ) return self.mapping[majority]