# -*- coding: utf-8 -*-
"""musicgen.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1WDfiDZBn5sYlxm8E7MeDb9t2PaywUqxK

MUSIC GENERATION WITH MUSICGEN

Links:

- https://github.com/facebookresearch/audiocraft/blob/main/demos/musicgen_demo.ipynb

- https://colab.research.google.com/drive/1fxGqfg96RBUvGxZ1XXN07s3DthrKUl4-?usp=sharing

- https://docs.openvino.ai/2023.0/notebooks/250-music-generation-with-output.html

Welcome to MusicGen's demo Jupyter notebook. Here you will find a series of self-contained examples of how to use MusicGen in different settings.

First, we initialize MusicGen. You can choose a model from the following selection:

1. facebook/musicgen-small - a 300M-parameter transformer decoder.

2. facebook/musicgen-medium - a 1.5B-parameter transformer decoder.

3. facebook/musicgen-melody - a 1.5B-parameter transformer decoder that also supports melody conditioning.

4. facebook/musicgen-large - a 3.3B-parameter transformer decoder.

We will use the facebook/musicgen-small variant for this demonstration.
"""

!python3 -m pip install -U git+https://github.com/facebookresearch/audiocraft#egg=audiocraft
# !python3 -m pip install -U audiocraft

!python --version
# Check only pip-installed packages (math and locale are stdlib modules and never appear in pip list).
!for p in audiocraft torchaudio torch torchtext transformers; do pip list | grep "^$p[ \t]"; done

"""Python 3.10.12

audiocraft                         1.4.0a1
torchaudio                         2.1.0
torch                              2.1.0
torchtext                          0.16.0
transformers                       4.47.1

"""

from audiocraft.models import MusicGen
from audiocraft.models import MultiBandDiffusion

# Set to True to additionally decode the generated tokens with MultiBandDiffusion,
# which can sound cleaner than the default EnCodec decoder at extra compute cost.
USE_DIFFUSION_DECODER = False
# Using the small model; better results would be obtained with `medium` or `large`.
model = MusicGen.get_pretrained('facebook/musicgen-small')
if USE_DIFFUSION_DECODER:
    mbd = MultiBandDiffusion.get_mbd_musicgen()
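
"""A quick sanity check on the loaded model (added; not in the original notebook): report the language model's parameter count and the output sample rate."""

n_params = sum(p.numel() for p in model.lm.parameters())
print(f"LM parameters: {n_params / 1e6:.0f}M, sample rate: {model.sample_rate} Hz")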

"""Next, let us configure the generation parameters. Specifically, you can control the following:

- use_sampling (bool, optional): use sampling if True, else do argmax decoding. Defaults to True.
- top_k (int, optional): top_k used for sampling. Defaults to 250.
- top_p (float, optional): top_p used for sampling, when set to 0 top_k is used. Defaults to 0.0.
- temperature (float, optional): softmax temperature parameter. Defaults to 1.0.
duration (float, optional): duration of the generated waveform. Defaults to 30.0.
- cfg_coef (float, optional): coefficient used for classifier free guidance. Defaults to 3.0.

When left unchanged, MusicGen will revert to its default parameters.

"""

model.set_generation_params(
    use_sampling=True,
    top_k=250,
    duration=30
)
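
"""As a sketch of an alternative decoding strategy (added; not in the original notebook): setting top_k=0 and top_p>0 switches to nucleus (top-p) sampling, where the model samples from the smallest token set whose cumulative probability exceeds top_p."""

# Example: nucleus sampling with a shorter duration for faster iteration.
model.set_generation_params(use_sampling=True, top_k=0, top_p=0.9, duration=10)
# Restore the settings used throughout the rest of this notebook.
model.set_generation_params(use_sampling=True, top_k=250, duration=30)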

"""Next, we can go ahead and start generating music using one of the following modes:

- Unconditional samples using model.generate_unconditional
- Music continuation using model.generate_continuation
- Text-conditional samples using model.generate
- Melody-conditional samples using model.generate_with_chroma
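
"""Unconditional Generation

The notebook lists this mode without demonstrating it, so here is a minimal sketch (an addition to the original) using model.generate_unconditional:
"""

from audiocraft.utils.notebook import display_audio

# Two samples with no conditioning at all, at the duration configured above (30 s).
unconditional = model.generate_unconditional(num_samples=2, progress=True)
display_audio(unconditional, sample_rate=32000)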

"""Music Continuation

Generate a "bip-bip" sound used to start the samples.
"""

import math
import torchaudio
import torch
from audiocraft.utils.notebook import display_audio

def get_bip_bip(bip_duration=0.125, frequency=440,
                duration=0.5, sample_rate=32000, device="cuda"):
    """Generates a series of bips at the given frequency."""
    t = torch.arange(
        int(duration * sample_rate), device=device, dtype=torch.float) / sample_rate
    # A pure tone at the requested frequency...
    wav = torch.cos(2 * math.pi * frequency * t)[None]
    # ...gated on during the second half of every 2 * bip_duration period.
    tp = (t % (2 * bip_duration)) / (2 * bip_duration)
    envelope = (tp >= 0.5).float()
    return wav * envelope

bip_bip = get_bip_bip(0.125)
display_audio(bip_bip, 32000)

"""Generate samples in different styles starting with the "bip-bip" sound."""

# Here we use a synthetic signal to prompt both the tonality and the BPM
# of the generated audio. The prompt must be [batch, channels, time]:
# expand(2, -1, -1) turns the [1, T] bip into a batch of two mono prompts,
# one per text description.
res = model.generate_continuation(
    get_bip_bip(0.125).expand(2, -1, -1),
    32000, ['Jazz jazz and only jazz',
            'Heartful EDM with beautiful synths and chords'],
    progress=True)
display_audio(res, 32000)

prompt = "ancient chinese music" # @param {type:"string"}
res = model.generate_continuation(
    get_bip_bip(0.125, 440, 0.5).expand(1, -1),
    32000, [prompt],
    progress=True)
display_audio(res, 32000)

"""To load the mp3 file, run one of the two following cells (uncomment it) :

- First cell : upload file directly from github.

- Second cell : upload a file from your computer.

"""

# You can also use any audio from a file. Make sure to trim the file if it is too long!
# Workaround for a Colab locale issue that can break shell cells with UTF-8 output.
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!rm -f bach.mp3*
!wget https://github.com/facebookresearch/audiocraft/raw/main/assets/bach.mp3
!ls
#!ffmpeg -i "bach.mp3" "bach.wav"
#!ls
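
"""A small guard (added; not in the original notebook): fail early with a clear message if the download did not produce the file."""

import os

assert os.path.exists("bach.mp3"), "bach.mp3 not found; re-run the wget cell or upload the file manually."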

"""Upload file from here : https://github.com/facebookresearch/audiocraft/blob/main/assets/bach.mp3

or upload your own file.


"""

#from google.colab import drive
#drive.mount('/content/drive')

#from google.colab import files
#uploaded = files.upload()

# prompt_waveform, prompt_sr = torchaudio.load("../assets/bach.mp3")
prompt_waveform, prompt_sr = torchaudio.load("bach.mp3", format="mp3")
display_audio(prompt_waveform, sample_rate=prompt_sr)  # play the prompt at its native rate
# prompt_waveform, prompt_sr = librosa.load("bach.mp3", sr=None)
prompt_duration = 2  # seconds of audio to condition on
prompt_waveform = prompt_waveform[..., :int(prompt_duration * prompt_sr)]
output = model.generate_continuation(prompt_waveform, prompt_sample_rate=prompt_sr, progress=True, return_tokens=True)
display_audio(output[0], sample_rate=32000)
if USE_DIFFUSION_DECODER:
    out_diffusion = mbd.tokens_to_wav(output[1])
    display_audio(out_diffusion, sample_rate=32000)
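
"""To keep the results, we can write them to disk with audiocraft's audio_write helper (this save step is an addition; the 'continuation' file names are arbitrary)."""

from audiocraft.data.audio import audio_write

for i, one_wav in enumerate(output[0]):
    # audio_write expects a [C, T] CPU tensor and appends the .wav suffix itself.
    audio_write(f'continuation_{i}', one_wav.cpu(), model.sample_rate, strategy="loudness")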

"""Text-conditional Generation

"""

from audiocraft.utils.notebook import display_audio

output = model.generate(
    descriptions=[
        '80s pop track with bassy drums and synth',
        '90s rock song with loud guitars and heavy drums',
        'Progressive rock drum and bass solo',
        'Punk Rock song with loud drum and power guitar',
        'Bluesy guitar instrumental with soulful licks and a driving rhythm section',
        'Jazz Funk song with slap bass and powerful saxophone',
        'drum and bass beat with intense percussions'
    ],
    progress=True, return_tokens=True
)
display_audio(output[0], sample_rate=32000)
if USE_DIFFUSION_DECODER:
    out_diffusion = mbd.tokens_to_wav(output[1])
    display_audio(out_diffusion, sample_rate=32000)

prompt = "80s english new wave" # @param {type:"string"}

output = model.generate(
    descriptions=[
        prompt
    ],
    progress=True, return_tokens=True
)
display_audio(output[0], sample_rate=32000)
if USE_DIFFUSION_DECODER:
    out_diffusion = mbd.tokens_to_wav(output[1])
    display_audio(out_diffusion, sample_rate=32000)

"""Other examples with medium model (from https://colab.research.google.com/drive/1fxGqfg96RBUvGxZ1XXN07s3DthrKUl4-?usp=sharing)"""

model_medium = MusicGen.get_pretrained('facebook/musicgen-medium', device='cuda')
model_medium.set_generation_params(duration=20)

res = model_medium.generate([
    'crazy EDM, heavy bang',
    'classic reggae track with an electronic guitar solo',
    'lofi slow bpm electro chill with organic samples',
    'rock with saturated guitars, a heavy bass line and crazy drum break and fills.',
    'earthy tones, environmentally conscious, ukulele-infused, harmonic, breezy, easygoing, organic instrumentation, gentle grooves',
],
    progress=True)
display_audio(res, 32000)

prompt = "ancient chinese music" # @param {type:"string"}

res = model_medium.generate([
    prompt
],
    progress=True)

display_audio(res, 32000)
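
"""Each checkpoint stays resident on the GPU, so loading several models in one session can exhaust memory. A cleanup sketch (added; not in the original notebook) before moving on:"""

import gc

del model_medium
gc.collect()
torch.cuda.empty_cache()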

"""Other example from https://colab.research.google.com/corgiredirector?site=https%3A%2F%2Fdocs.openvino.ai%2F2023.0%2Fnotebooks%2F250-music-generation-with-output.html"""

from IPython.display import Audio
from transformers import AutoProcessor, MusicgenForConditionalGeneration

# Load the pipeline. The torchscript and return_dict flags come from the source
# OpenVINO notebook, where the model is traced for export; they are not
# otherwise required for generation.
model_small = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small", torchscript=True, return_dict=False)

device = "cpu"
sample_length = 15  # seconds

# The audio encoder emits frame_rate tokens per second per codebook (50 Hz for
# the 32 kHz EnCodec), plus a small margin of 3 tokens.
n_tokens = sample_length * model_small.config.audio_encoder.frame_rate + 3
sampling_rate = model_small.config.audio_encoder.sampling_rate
print('Sampling rate is', sampling_rate, 'Hz')

model_small.to(device)
model_small.eval();

processor = AutoProcessor.from_pretrained("facebook/musicgen-small")

inputs = processor(
    text=["80s pop track with bassy drums and synth"],
    return_tensors="pt",
)

audio_values = model_small.generate(**inputs.to(device), do_sample=True, guidance_scale=3, max_new_tokens=n_tokens)

Audio(audio_values[0].cpu().numpy(), rate=sampling_rate)

prompt = "80s english new wave" # @param {type:"string"}

inputs = processor(
    text=[prompt],
    return_tensors="pt",
)

audio_values = model_small.generate(**inputs.to(device), do_sample=True, guidance_scale=3, max_new_tokens=n_tokens)

Audio(audio_values[0].cpu().numpy(), rate=sampling_rate)
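
"""To save the transformers output (a save step added here), the standard pattern from the transformers MusicGen docs writes the waveform with scipy:"""

import scipy.io.wavfile

# audio_values has shape [batch, channels, samples]; write the first example's first channel.
scipy.io.wavfile.write("musicgen_out.wav", rate=sampling_rate, data=audio_values[0, 0].cpu().numpy())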

"""Melody-conditional Generation"""

import torchaudio
from audiocraft.utils.notebook import display_audio

model_melody = MusicGen.get_pretrained('facebook/musicgen-melody')
model_melody.set_generation_params(duration=8)

melody_waveform, sr = torchaudio.load("bach.mp3", format="mp3")
display_audio(melody_waveform, sample_rate=sr)  # play the melody at its native rate

# [C, T] -> [2, C, T]: duplicate the melody so each text description gets a copy.
melody_waveform = melody_waveform.unsqueeze(0).repeat(2, 1, 1)
output = model_melody.generate_with_chroma(
    descriptions=[
        '80s pop track with bassy drums and synth',
        '90s rock song with loud guitars and heavy drums',
    ],
    melody_wavs=melody_waveform,
    melody_sample_rate=sr,
    progress=True, return_tokens=True
)
display_audio(output[0], sample_rate=32000)
if USE_DIFFUSION_DECODER:
    out_diffusion = mbd.tokens_to_wav(output[1])
    display_audio(out_diffusion, sample_rate=32000)