Problems that arise when learning features from audio files

import os

import librosa
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tqdm import tqdm

cough_folder = "C:/Users/User/Desktop/dataset/cough"
nocough_folder = "C:/Users/User/Desktop/dataset/nocough"

n_mfcc = 20
n_mels = 64  # defined but not used below

def extract_mfcc(file_path, n_mfcc=20):
    # Load audio resampled to 16 kHz and compute its MFCC matrix of shape (n_mfcc, T)
    signal, sr = librosa.load(file_path, sr=16000)
    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
    return mfccs

def mfcc_to_image(mfcc):
    fig, ax = plt.subplots()
    ax.imshow(mfcc, interpolation='nearest', cmap='gray')
    ax.axis('off')
    fig.canvas.draw()

    # Grab the rendered canvas as raw ARGB bytes
    data = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8)
    width, height = fig.canvas.get_width_height()

    # Reshape the flat buffer to (height, width, 4) (ARGB order)
    data = data.reshape((height, width, 4))

    # Convert ARGB to RGB (drop the alpha channel):
    # matplotlib renders in ARGB order, but we only need RGB here.
    data = data[..., 1:]  # remove the first (alpha) channel, leaving RGB

    plt.close(fig)  # close the figure to release its resources

    # Finally, convert to a PIL Image
    image = Image.fromarray(data)
    return image

class MFCCImageDataset(Dataset):
    def __init__(self, file_paths, labels, transform=None):
        self.file_paths = file_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        mfcc = extract_mfcc(file_path)
        image = mfcc_to_image(mfcc)
        label = self.labels[idx]
        if self.transform:
            image = self.transform(image)
        return image, label

nocough_files = [os.path.join(nocough_folder, file) for file in os.listdir(nocough_folder)]
cough_files = [os.path.join(cough_folder, file) for file in os.listdir(cough_folder)]
files = nocough_files + cough_files
labels = [0] * len(nocough_files) + [1] * len(cough_files)

# Define the dataset transforms
transform = transforms.Compose([
    transforms.Resize((128, 128)),  # resize the image
    transforms.ToTensor(),  # convert to a tensor
])

# Instantiate a dataset and loader over all files
# (superseded by the split train/val/test loaders below)
dataset = MFCCImageDataset(files, labels, transform=transform)

# Define the data loader
batch_size = 64
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

from sklearn.model_selection import train_test_split

# Split the dataset into train+validation and test sets
X_trainval, X_test, y_trainval, y_test = train_test_split(files, labels, test_size=0.1, random_state=42)

# Split again into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1, random_state=42)

# Print the size of each set
print("Training set size:", len(X_train))
print("Validation set size:", len(X_val))
print("Test set size:", len(X_test))


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Instantiate the training, validation, and test datasets
train_dataset = MFCCImageDataset(X_train, y_train, transform=transform)
val_dataset = MFCCImageDataset(X_val, y_val, transform=transform)
test_dataset = MFCCImageDataset(X_test, y_test, transform=transform)

# Define the data loaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

from torchvision.models import resnet50

class ResNetAudio(nn.Module):
    def __init__(self, num_classes):
        super(ResNetAudio, self).__init__()
        self.resnet = resnet50(pretrained=False)  # weights=None in newer torchvision

        # Replace the final layer with a new fully connected layer
        num_ftrs = self.resnet.fc.in_features
        self.resnet.fc = nn.Linear(num_ftrs, num_classes)

    def forward(self, x):
        return self.resnet(x)

# Instantiate the model and move it to the device chosen above
num_classes = 2  # number of classes (cough, no cough)
model = ResNetAudio(num_classes=num_classes)
model.to(device)

# Set up the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    # Wrap the training data loader with tqdm for a progress bar
    for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch"):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    
    epoch_loss = running_loss / len(train_loader)
    epoch_accuracy = correct / total
    
    # Compute accuracy on the validation set
    model.eval()
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for val_images, val_labels in tqdm(val_loader, desc="Validating", unit="batch"):
            val_images, val_labels = val_images.to(device), val_labels.to(device)
            val_outputs = model(val_images)
            _, val_predicted = torch.max(val_outputs, 1)
            val_total += val_labels.size(0)
            val_correct += (val_predicted == val_labels).sum().item()
    val_accuracy = val_correct / val_total
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Training Accuracy: {epoch_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}")

print("Training finished.")

This is a project that extracts MFCCs from audio files and trains a CNN on them.

My dataset contains close to 20,000 files.

There are a few problems.

First, I am curious about the difference between training on the MFCC rendered as an image and feeding it to the network directly as a NumPy array.
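
For comparison, the NumPy version I am referring to looked roughly like this sketch (MFCCArrayDataset and target_frames are illustrative names, and clips are assumed padded or trimmed to a fixed 128 frames):

class MFCCArrayDataset(Dataset):
    def __init__(self, file_paths, labels, target_frames=128):
        self.file_paths = file_paths
        self.labels = labels
        self.target_frames = target_frames

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        mfcc = extract_mfcc(self.file_paths[idx])  # (n_mfcc, T) float array
        # Pad or trim along the time axis so every sample has the same shape
        T = mfcc.shape[1]
        if T < self.target_frames:
            mfcc = np.pad(mfcc, ((0, 0), (0, self.target_frames - T)))
        else:
            mfcc = mfcc[:, :self.target_frames]
        x = torch.tensor(mfcc, dtype=torch.float32).unsqueeze(0)  # (1, n_mfcc, frames)
        return x.repeat(3, 1, 1), self.labels[idx]  # 3 channels for the ResNet stem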

Second, each epoch takes close to 20 minutes. Is this normal?
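
One workaround I am considering is caching the MFCCs to disk once, so later epochs load arrays instead of re-decoding audio and re-rendering figures every time (a sketch reusing files and extract_mfcc from above; cache_dir is an assumed location):

cache_dir = "C:/Users/User/Desktop/dataset/mfcc_cache"
os.makedirs(cache_dir, exist_ok=True)

for path in files:
    out = os.path.join(cache_dir, os.path.basename(path) + ".npy")
    if not os.path.exists(out):
        np.save(out, extract_mfcc(path))
# __getitem__ could then np.load(...) the cached array instead of recomputing,
# and DataLoader(..., num_workers=4) would overlap loading with GPU work.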

Third, the validation accuracy zigzags, and the kernel keeps dying on the third or fourth epoch. I would appreciate it if you could review the code to see if there are any problems.
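
One guess I want to verify is that matplotlib's interactive notebook backend is accumulating figure state; forcing the non-interactive Agg backend before pyplot is imported is a common mitigation (a sketch, not a confirmed fix):

import matplotlib
matplotlib.use("Agg")  # must run before "import matplotlib.pyplot"
import matplotlib.pyplot as plt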

When I wrote the code with NumPy arrays, it showed an ideal validation curve, but this version zigzags and the kernel keeps dying. It also takes too much training time. Is there a problem with my code?
