I'm trying to write a Python class with methods that let me edit audio. The method in question is an attempt at pitch shifting with a phase vocoder; I know librosa and other libraries already provide one, but I would like to implement it myself. The vocoder works when the pitch shift is zero semitones, but as soon as I shift by anything else the output becomes choppy and weird. I'm trying to use the method from JentGent, seen here: https://github.com/JentGent/pitch-shift/blob/main/audios.ipynb.
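As far as I understand the notebook, the idea is to time-stretch the STFT by a factor of 2 ** (semitones / 12), rebuilding the phases from the frame-to-frame phase differences, and then play the stretched result back at a proportionally higher sample rate so the duration comes back down and the pitch goes up. A tiny sketch of that relationship with made-up numbers (my paraphrase, not code from the notebook):

import numpy as np

semitones = 12
samplerate = 44100
n_frames = 100                                      # number of STFT time frames in the original clip
scaling = 2 ** (semitones / 12)                     # 12 semitones up -> factor of 2
n_synth_frames = int(np.floor(n_frames * scaling))  # stretch the frame axis: 100 -> 200 frames
playback_rate = int(samplerate * scaling)           # 88200 Hz playback undoes the stretch and raises the pitch

Here is my class: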
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile
from scipy.io.wavfile import write
import scipy.signal
import IPython
from IPython.display import Audio
class Audio_object:
    def __init__(self, file, mono=True):
        # load the wav file; optionally mix multichannel audio down to mono
        self.samplerate, self.data = wavfile.read(file)
        if mono and self.data.ndim > 1:
            self.data = np.mean(self.data, axis=1).astype(self.data.dtype)

    def writefile(self, name):
        # write the current samples back out as 16-bit PCM
        filename = name + ".wav"
        write(filename, self.samplerate, self.data.astype(np.int16))
    def stft(self, window='hann', seg_ratio=0.1, overlap=0.5):
        # segment length as a fraction of the sample rate; noverlap is in samples, so convert the fractional overlap
        nperseg = int(self.samplerate * seg_ratio)
        noverlap = int(nperseg * overlap)
        f, t, Zxx = scipy.signal.stft(self.data, fs=self.samplerate, window=window, nperseg=nperseg, noverlap=noverlap)
        return f, t, Zxx

    def inverse_stft(self, Zxx, window='hann', seg_ratio=0.1, overlap=0.5):
        nperseg = int(self.samplerate * seg_ratio)
        noverlap = int(nperseg * overlap)
        _, x_rec = scipy.signal.istft(Zxx, fs=self.samplerate, window=window, nperseg=nperseg, noverlap=noverlap)
        return x_rec
    def circle(self, name):
        # round-trip the audio through stft/istft as a sanity check of the reconstruction
        filename = name + ".wav"
        _, _, Zxx = self.stft()
        x_rec = self.inverse_stft(Zxx)
        write(filename, self.samplerate, x_rec.astype(np.int16))
    def plot(self, data, title):
        plt.plot(data)
        plt.xlabel("Sample Index")
        plt.ylabel("Amplitude")
        plt.title(title)
    def plot_fft(self):
        f, _, Zxx = self.stft()
        # average the magnitude across time frames; only for visual inspection, not for further processing
        avg_spectrum = np.mean(np.abs(Zxx), axis=1)
        plt.figure(figsize=(10, 5))
        plt.plot(f, avg_spectrum)
        plt.title('Average FFT Magnitude Spectrum')
        plt.xlabel('Frequency [Hz]')
        plt.ylabel('Magnitude')
        plt.grid(True)
        plt.show()
    def plot_spectrogram(self, f, t, Zxx):
        plt.figure(figsize=(10, 5))
        plt.pcolormesh(t, f, np.abs(Zxx), shading='gouraud')
        plt.title('Spectrogram')
        plt.xlabel('Time [s]')
        plt.ylabel('Frequency [Hz]')
        plt.ylim(0, 3500)
        plt.show()
    @staticmethod
    def interpolate_time(idxs, arr):
        # my attempt at linearly interpolating arr along its time axis (axis 1) at the fractional frame indices in idxs
        start = np.minimum((idxs + 0.5).astype(int), arr.shape[1] - 1)
        frac = (idxs - start)[None, None, :]
        shifted_arr = np.concatenate((arr[:, 1:], np.zeros((arr.shape[0], arr.shape[1]))), axis=1)
        return arr[:, start] * (1 - frac) + shifted_arr[:, start] * frac
    def pitch_shift(self, semitones, seg_ratio=0.1, overlap=0.5):
        # intent: time-stretch the STFT by 2**(semitones/12), then play back at a scaled rate to shift the pitch
        scaling = 2 ** (semitones / 12)
        f, t, Zxx = self.stft(seg_ratio=seg_ratio, overlap=overlap)
        anls_frames = np.arange(len(Zxx))
        n_synth_frames = np.floor(len(Zxx) * scaling).astype(int)
        synth_frames = np.arange(n_synth_frames)
        # map each synthesis frame back to a fractional position among the analysis frames
        og_idxs = np.minimum(synth_frames / scaling, len(Zxx) - 1)
        mags = np.abs(Zxx)
        phases = np.angle(Zxx)
        # frame-to-frame phase differences, wrapped to [0, 2*pi)
        phase_diffs = phases - np.concatenate((np.zeros((len(Zxx), 1)), phases[:, :-1]), axis=1)
        phase_diffs = np.mod(phase_diffs, np.pi * 2)
        # resample magnitudes and phase differences onto the stretched frame grid,
        # then rebuild absolute phases by accumulating the differences
        shifted_mags = self.interpolate_time(og_idxs, mags)
        shifted_phase_diffs = self.interpolate_time(og_idxs, phase_diffs)
        shifted_phases = np.cumsum(shifted_phase_diffs, axis=2)
        synth_stft = shifted_mags * np.exp(shifted_phases * 1j)
        new_waveform = self.inverse_stft(synth_stft.astype(np.complex64), seg_ratio=seg_ratio, overlap=overlap)
        # playing back at samplerate * scaling is meant to undo the time stretch and leave only the pitch change
        return Audio(new_waveform.astype(np.int16), rate=int(self.samplerate * scaling))
gettysburg = Audio_object("gettysburg.wav", mono=True)
audio_widget = gettysburg.pitch_shift(12)
audio_widget
I've been trying to mess around with the variables, but the STFT arrays are a little too complex for me to reason about.
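In case it helps, this is the kind of minimal shape check I've been doing to get my head around the layout (a standalone sketch with a synthetic tone and made-up numbers, not my actual file):

import numpy as np
import scipy.signal

fs = 44100
x = np.sin(2 * np.pi * 440 * np.arange(fs) / fs)   # one second of a 440 Hz tone
nperseg = int(fs * 0.1)                            # same 0.1 * samplerate segment length as in my class
noverlap = int(nperseg * 0.5)                      # 50% overlap, expressed in samples
f, t, Zxx = scipy.signal.stft(x, fs=fs, window='hann', nperseg=nperseg, noverlap=noverlap)
print(Zxx.shape)       # (frequency bins, time frames): frequencies along axis 0, frames along axis 1
print(len(f), len(t))  # len(f) == Zxx.shape[0] and len(t) == Zxx.shape[1]

Even with that, I can't see where the pitch-shift path goes wrong, so any pointers would be appreciated.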