
I am looking to create a speech recognition model from scratch, without using an existing model. I have already used Whisper successfully, but I need to build a model that I can train myself, whose performance can then be measured against Whisper.

Is there any resource I can follow that will guide me through creating an end-to-end speech recognizer?

So far, I have followed some code that I found online, and currently it does not produce any output. I even tried to overfit it on a single file and test it on that same file, but it still produces nothing. I used the Mozilla Common Voice dataset and converted the mp3 files to .wav files at a 16,000 Hz sample rate.
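For reference, the conversion can be done along these lines with torchaudio (assuming an mp3-capable backend; the helper name and paths are illustrative):

import torchaudio

def convert_to_wav(mp3_path, wav_path, target_sr=16000):
    # load the mp3, downmix to mono, resample to 16 kHz, and write a .wav
    waveform, sr = torchaudio.load(mp3_path)
    waveform = waveform.mean(dim=0, keepdim=True)
    if sr != target_sr:
        waveform = torchaudio.functional.resample(waveform, sr, target_sr)
    torchaudio.save(wav_path, waveform, target_sr)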

import os
import string
import pandas as pd
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as Fn
from torch.utils.data import Dataset, DataLoader, random_split
# ---------- Feature Extraction ----------
mel_spec = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000,
    n_fft=400,
    win_length=400,
    hop_length=160,
    n_mels=80
)
amp_to_db = torchaudio.transforms.AmplitudeToDB()

def extract_features(wav_path):
    waveform, sr = torchaudio.load(wav_path)
    if sr != 16000:
        waveform = torchaudio.functional.resample(waveform, sr, 16000)
    spec = mel_spec(waveform)
    log_spec = amp_to_db(spec)
    # optional: per-utterance normalization; left disabled since it can make
    # training unstable
    # log_spec = (log_spec - log_spec.mean()) / (log_spec.std() + 1e-8)
    return log_spec.squeeze(0).transpose(0, 1)  # (T, 80)
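
# e.g. a 1-second clip at 16 kHz gives floor(16000 / 160) + 1 = 101 frames
# (the STFT is center-padded by default), i.e. a (101, 80) feature tensor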

# ---------- Vocabulary & Tokenizer ----------
english_chars = list(string.ascii_lowercase + " '")
vocab = ['<blank>'] + sorted(set(english_chars))
char2idx = {c: i for i, c in enumerate(vocab)}
idx2char = {i: c for c, i in char2idx.items()}

def text_to_indices(text):
    return [char2idx[c] for c in text.lower() if c in char2idx]

def collapse_tokens(pred_ids, idx2char):
    tokens = []
    prev = None
    for p in pred_ids:
        if p != prev and p != 0:
            tokens.append(idx2char[p])
        prev = p
    return ''.join(tokens)
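
# quick sanity check of the greedy collapse (indices follow the vocab above):
#   collapse_tokens([0, 10, 10, 0, 11], idx2char) -> "hi"
# blanks (index 0) are dropped and consecutive duplicates are merged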

# ---------- Dataset ----------
class SpeechDataset(Dataset):
    def __init__(self, df: pd.DataFrame, wav_dir: str):
        df_new = df.sample(400)  # randomly subsample 400 clips
        self.wavs = df_new['clip'].tolist()
        self.texts = df_new['sentence'].tolist()
        self.wav_dir = wav_dir

    def __len__(self):
        return len(self.wavs)

    def __getitem__(self, idx):
        wav_path = os.path.join(self.wav_dir, self.wavs[idx])
        spec = extract_features(wav_path)
        labels = torch.tensor(text_to_indices(self.texts[idx]), dtype=torch.long)
        return spec, labels


# ---------- Collate Function with Debug ----------
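# Pads spectrograms to (B, T_max, 80) and labels to (B, S_max), and returns
# the true lengths, which CTCLoss needs in order to ignore the padding.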
def collate_fn(batch):
    specs, labels = zip(*batch)
    spec_lens = [s.size(0) for s in specs]
    label_lens = [len(l) for l in labels]
    print("▶ batch label lengths:", label_lens[:5])

    max_spec = max(spec_lens)
    max_lab = max(label_lens)
    batch_spec = torch.zeros(len(batch), max_spec, 80)
    batch_lab = torch.zeros(len(batch), max_lab, dtype=torch.long)
    for i, (s, l) in enumerate(zip(specs, labels)):
        batch_spec[i, :s.size(0), :] = s
        batch_lab[i, :l.size(0)] = l
    return batch_spec, torch.tensor(spec_lens), batch_lab, torch.tensor(label_lens)

# ---------- Load DataFrame & Split ----------
full_df = pd.read_excel("transcripts_with_wav_clips.xlsx")
wav_dir = "cv-corpus-21.0-delta-2025-03-14/en/wav/"
dataset = SpeechDataset(full_df, wav_dir)

# 80/10/10 split
total = len(dataset)
n_train = int(0.8 * total)
n_val = int(0.1 * total)
n_test = total - n_train - n_val
train_ds, val_ds, test_ds = random_split(dataset, [n_train, n_val, n_test])

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, collate_fn=collate_fn)

# ---------- Model Definition ----------
class SpeechRecognitionModel(nn.Module):
    def __init__(self, n_mels=80, hidden=256, vocab_size=len(vocab), num_layers=2):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((1, 2))  # reduce mel bins by 2
        )
        self.fc_in = nn.Linear(32 * (n_mels // 2), hidden)
        self.lstm = nn.LSTM(hidden, hidden, num_layers=num_layers,
                           bidirectional=True, batch_first=True)
        self.classifier = nn.Linear(hidden * 2, vocab_size)

    def forward(self, x, lengths):
        B, T, F = x.size()
        x = x.unsqueeze(1)                 # (B,1,T,F)
        x = self.cnn(x)                    # (B,32,T,F/2)
        x = x.permute(0, 2, 1, 3).reshape(B, T, -1)  # (B,T,32*F/2)
        x = Fn.relu(self.fc_in(x))         # (B,T,hidden)
        x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        x, _ = self.lstm(x)
        x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        logits = self.classifier(x)
        return logits.log_softmax(dim=-1)
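
# shape sanity check with dummy inputs (hypothetical sizes):
#   x = torch.randn(2, 100, 80); lengths = torch.tensor([100, 80])
#   SpeechRecognitionModel()(x, lengths).shape -> (2, 100, len(vocab))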

# ---------- Setup Training ----------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SpeechRecognitionModel().to(device)
ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
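
# nn.CTCLoss expects log-probs of shape (T, B, V) and padded targets of
# shape (B, S); hence the .permute(1, 0, 2) on the model output below.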

# ---------- Evaluation Function ----------
def evaluate(loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for specs, spec_lens, labs, lab_lens in loader:
            specs, labs = specs.to(device), labs.to(device)
            out = model(specs, spec_lens).permute(1, 0, 2)
            total_loss += ctc_loss(out, labs, spec_lens, lab_lens).item()
    return total_loss / len(loader)

# ---------- Training Loop ----------
for epoch in range(1, 21):
    model.train()
    train_loss = 0
    for specs, spec_lens, labs, lab_lens in train_loader:
        specs, labs = specs.to(device), labs.to(device)
        optimizer.zero_grad()
        out = model(specs, spec_lens).permute(1, 0, 2)
        loss = ctc_loss(out, labs, spec_lens, lab_lens)
        loss.backward(); optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)
    val_loss = evaluate(val_loader)
    print(f"Epoch {epoch:2d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
# ---------- Final Test Decoding & WER ----------
# decode tokens for a batch, compute WER using jiwer
from jiwer import wer

def decode_and_evaluate(loader):
    model.eval()
    ground_truths, predictions = [], []
    with torch.no_grad():
        for specs, spec_lens, labs, lab_lens in loader:
            specs = specs.to(device)
            logits = model(specs, spec_lens)
            pred_ids = logits.argmax(-1).cpu().tolist()
            for p_ids, t_ids in zip(pred_ids, labs.tolist()):
                predictions.append(collapse_tokens(p_ids, idx2char))
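                # note: this also merges repeated characters inside the
                # ground truth itself (e.g. "ss" -> "s"), not just the prediction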
                ground_truths.append(collapse_tokens(t_ids, idx2char))
    return wer(ground_truths, predictions)

test_wer = decode_and_evaluate(test_loader)
print(f"Test WER: {test_wer:.3f}")

The final epoch has the following loss:

Epoch 20 | Train Loss: 2.9548 | Val Loss: 3.0483

And the output:

▶ batch label lengths: [52, 89, 81, 102, 35]
▶ batch label lengths: [91, 51, 54, 101, 65]
Test WER: 1.000

I then tried to overfit the model on a single file to see whether it works at all, but it still does not predict anything.

def transcribe_file(path):
    # 1) Extract & shape spec
    spec = extract_features(path).unsqueeze(0)      # (1, T, 80)
    
    # 2) Move ONLY the spectrogram to the device
    spec = spec.to(device)
    
    # 3) Build lengths as a CPU int64 tensor
    length = torch.tensor([spec.size(1)], dtype=torch.int64)
    
    # 4) Run through the model (length stays on CPU)
    model.eval()
    with torch.no_grad():
        logits = model(spec, length)[0]             # (T, V)
    
    # 5) Greedy decode
    pred_ids = logits.argmax(-1).cpu().tolist()    # greedy argmax, moved back to CPU
    
    # 6) Collapse repeats/blanks
    return collapse_tokens(pred_ids, idx2char)

# Pick a single sample from train_ds:
spec, labels = train_ds[0]
# Create a tiny DataLoader around it (DataLoader is already imported above)
one_loader = DataLoader([(spec, labels)], batch_size=1, collate_fn=collate_fn)

# Train for 200 epochs on just that one example
model = SpeechRecognitionModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(1, 201):
    model.train()
    for specs, spec_lens, labs, lab_lens in one_loader:
        specs, labs = specs.to(device), labs.to(device)
        optimizer.zero_grad()
        out = model(specs, spec_lens).permute(1,0,2)
        loss = ctc_loss(out, labs, spec_lens, lab_lens)
        loss.backward(); optimizer.step()
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# Now decode it
pred = transcribe_file(os.path.join(wav_dir, train_ds.dataset.wavs[train_ds.indices[0]]))
gt   = collapse_tokens(labels.tolist(), idx2char)
print("GT :", gt)
print("PRED:", pred)

And the output:

Epoch 200, Loss: 2.4376
GT : the busineses are listed in the indonesian and singapore stock exchanges
PRED: 
  • I would recommend starting from something simpler. Try working with pre-trained models unless building one yourself is the point. Here is a torchaudio-based resource you could use: deepgram.com/learn/python-speech-recognition-locally-torchaudio
  • I am expected to create a model without using any pre-trained models. I know how to use transformers and other pre-trained models, but I have no ML experience beyond theory, and I am expected to build this within 1–1.5 months and compare it with Whisper and Wav2Vec on WER and CER.
  • This seems awfully ambitious; who is requiring this?
  • By the way, there are similar sites: Data Science, Cross Validated, Artificial Intelligence.
  • It is required by my university as a project. I have already tried this question on Cross Validated; members said it was off topic there since it didn't specifically involve statistics.
