
I am looking to create a speech recognition model from scratch, without using an existing model. I have already used Whisper successfully, but I need to build a model that I can train myself, whose performance can then be measured against Whisper.

Is there any resource I can follow that will guide me through creating an end-to-end speech recognizer?

So far, I have followed some code that I found online, and currently it does not produce any output. I even tried to overfit it on a single file and test it on that same file, but it still produces nothing. I used the Mozilla Common Voice dataset and converted the mp3 files to .wav files at a 16,000 Hz sample rate.
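For reference, the conversion can be done along these lines with torchaudio (assuming an mp3-capable backend; the helper name and paths are illustrative):

import torchaudio

def convert_to_wav(mp3_path, wav_path, target_sr=16000):
    # load the mp3, downmix to mono, resample to 16 kHz, and write a .wav
    waveform, sr = torchaudio.load(mp3_path)
    waveform = waveform.mean(dim=0, keepdim=True)
    if sr != target_sr:
        waveform = torchaudio.functional.resample(waveform, sr, target_sr)
    torchaudio.save(wav_path, waveform, target_sr)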

import os
import string
import pandas as pd
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as Fn
from torch.utils.data import Dataset, DataLoader, random_split
# ---------- Feature Extraction ----------
mel_spec = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000,
    n_fft=400,
    win_length=400,
    hop_length=160,
    n_mels=80
)
amp_to_db = torchaudio.transforms.AmplitudeToDB()

def extract_features(wav_path):
    waveform, sr = torchaudio.load(wav_path)
    if sr != 16000:
        waveform = torchaudio.functional.resample(waveform, sr, 16000)
    spec = mel_spec(waveform)
    log_spec = amp_to_db(spec)
    # optional: per-utterance normalization; left disabled since it can make
    # training unstable
    # log_spec = (log_spec - log_spec.mean()) / (log_spec.std() + 1e-8)
    return log_spec.squeeze(0).transpose(0, 1)  # (T, 80)
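
# e.g. a 1-second clip at 16 kHz gives floor(16000 / 160) + 1 = 101 frames
# (the STFT is center-padded by default), i.e. a (101, 80) feature tensor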

# ---------- Vocabulary & Tokenizer ----------
english_chars = list(string.ascii_lowercase + " '")
vocab = ['<blank>'] + sorted(set(english_chars))
char2idx = {c: i for i, c in enumerate(vocab)}
idx2char = {i: c for c, i in char2idx.items()}

def text_to_indices(text):
    return [char2idx[c] for c in text.lower() if c in char2idx]

def collapse_tokens(pred_ids, idx2char):
    tokens = []
    prev = None
    for p in pred_ids:
        if p != prev and p != 0:
            tokens.append(idx2char[p])
        prev = p
    return ''.join(tokens)
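
# quick sanity check of the greedy collapse (indices follow the vocab above):
#   collapse_tokens([0, 10, 10, 0, 11], idx2char) -> "hi"
# blanks (index 0) are dropped and consecutive duplicates are merged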

# ---------- Dataset ----------
class SpeechDataset(Dataset):
    def __init__(self, df: pd.DataFrame, wav_dir: str):
        df_new = df.sample(400)  # randomly subsample 400 clips
        self.wavs = df_new['clip'].tolist()
        self.texts = df_new['sentence'].tolist()
        self.wav_dir = wav_dir

    def __len__(self):
        return len(self.wavs)

    def __getitem__(self, idx):
        wav_path = os.path.join(self.wav_dir, self.wavs[idx])
        spec = extract_features(wav_path)
        labels = torch.tensor(text_to_indices(self.texts[idx]), dtype=torch.long)
        return spec, labels


# ---------- Collate Function with Debug ----------
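# Pads spectrograms to (B, T_max, 80) and labels to (B, S_max), and returns
# the true lengths, which CTCLoss needs in order to ignore the padding.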
def collate_fn(batch):
    specs, labels = zip(*batch)
    spec_lens = [s.size(0) for s in specs]
    label_lens = [len(l) for l in labels]
    print("▶ batch label lengths:", label_lens[:5])

    max_spec = max(spec_lens)
    max_lab = max(label_lens)
    batch_spec = torch.zeros(len(batch), max_spec, 80)
    batch_lab = torch.zeros(len(batch), max_lab, dtype=torch.long)
    for i, (s, l) in enumerate(zip(specs, labels)):
        batch_spec[i, :s.size(0), :] = s
        batch_lab[i, :l.size(0)] = l
    return batch_spec, torch.tensor(spec_lens), batch_lab, torch.tensor(label_lens)

# ---------- Load DataFrame & Split ----------
full_df = pd.read_excel("transcripts_with_wav_clips.xlsx")
wav_dir = "cv-corpus-21.0-delta-2025-03-14/en/wav/"
dataset = SpeechDataset(full_df, wav_dir)

# 80/10/10 split
total = len(dataset)
n_train = int(0.8 * total)
n_val = int(0.1 * total)
n_test = total - n_train - n_val
train_ds, val_ds, test_ds = random_split(dataset, [n_train, n_val, n_test])

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, collate_fn=collate_fn)

# ---------- Model Definition ----------
class SpeechRecognitionModel(nn.Module):
    def __init__(self, n_mels=80, hidden=256, vocab_size=len(vocab), num_layers=2):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((1, 2))  # reduce mel bins by 2
        )
        self.fc_in = nn.Linear(32 * (n_mels // 2), hidden)
        self.lstm = nn.LSTM(hidden, hidden, num_layers=num_layers,
                           bidirectional=True, batch_first=True)
        self.classifier = nn.Linear(hidden * 2, vocab_size)

    def forward(self, x, lengths):
        B, T, F = x.size()
        x = x.unsqueeze(1)                 # (B,1,T,F)
        x = self.cnn(x)                    # (B,32,T,F/2)
        x = x.permute(0, 2, 1, 3).reshape(B, T, -1)  # (B,T,32*F/2)
        x = Fn.relu(self.fc_in(x))         # (B,T,hidden)
        x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        x, _ = self.lstm(x)
        x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        logits = self.classifier(x)
        return logits.log_softmax(dim=-1)
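
# shape sanity check with dummy inputs (hypothetical sizes):
#   x = torch.randn(2, 100, 80); lengths = torch.tensor([100, 80])
#   SpeechRecognitionModel()(x, lengths).shape -> (2, 100, len(vocab))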

# ---------- Setup Training ----------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SpeechRecognitionModel().to(device)
ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
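
# nn.CTCLoss expects log-probs of shape (T, B, V) and padded targets of
# shape (B, S); hence the .permute(1, 0, 2) on the model output below.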

# ---------- Evaluation Function ----------
def evaluate(loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for specs, spec_lens, labs, lab_lens in loader:
            specs, labs = specs.to(device), labs.to(device)
            out = model(specs, spec_lens).permute(1, 0, 2)
            total_loss += ctc_loss(out, labs, spec_lens, lab_lens).item()
    return total_loss / len(loader)

# ---------- Training Loop ----------
for epoch in range(1, 21):
    model.train()
    train_loss = 0
    for specs, spec_lens, labs, lab_lens in train_loader:
        specs, labs = specs.to(device), labs.to(device)
        optimizer.zero_grad()
        out = model(specs, spec_lens).permute(1, 0, 2)
        loss = ctc_loss(out, labs, spec_lens, lab_lens)
        loss.backward(); optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)
    val_loss = evaluate(val_loader)
    print(f"Epoch {epoch:2d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
# ---------- Final Test Decoding & WER ----------
# decode tokens for a batch, compute WER using jiwer
from jiwer import wer

def decode_and_evaluate(loader):
    model.eval()
    ground_truths, predictions = [], []
    with torch.no_grad():
        for specs, spec_lens, labs, lab_lens in loader:
            specs = specs.to(device)
            logits = model(specs, spec_lens)
            pred_ids = logits.argmax(-1).cpu().tolist()
            for p_ids, t_ids in zip(pred_ids, labs.tolist()):
                predictions.append(collapse_tokens(p_ids, idx2char))
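                # note: this also merges repeated characters inside the
                # ground truth itself (e.g. "ss" -> "s"), not just the prediction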
                ground_truths.append(collapse_tokens(t_ids, idx2char))
    return wer(ground_truths, predictions)

test_wer = decode_and_evaluate(test_loader)
print(f"Test WER: {test_wer:.3f}")

The final epoch has the following loss:

Epoch 20 | Train Loss: 2.9548 | Val Loss: 3.0483

And the output:

▶ batch label lengths: [52, 89, 81, 102, 35]
▶ batch label lengths: [91, 51, 54, 101, 65]
Test WER: 1.000

I then tried to overfit the model on a single file to see whether it works at all, but it still does not predict anything.

def transcribe_file(path):
    # 1) Extract & shape spec
    spec = extract_features(path).unsqueeze(0)      # (1, T, 80)
    
    # 2) Move ONLY the spectrogram to the device
    spec = spec.to(device)
    
    # 3) Build lengths as a CPU int64 tensor
    length = torch.tensor([spec.size(1)], dtype=torch.int64)
    
    # 4) Run through the model (length stays on CPU)
    model.eval()
    with torch.no_grad():
        logits = model(spec, length)[0]             # (T, V)
    
    # 5) Greedy decode
    pred_ids = logits.argmax(-1).cpu().tolist()    # greedy argmax, moved back to CPU
    
    # 6) Collapse repeats/blanks
    return collapse_tokens(pred_ids, idx2char)

# Pick a single sample from train_ds:
spec, labels = train_ds[0]
# Create a tiny DataLoader around it (DataLoader is already imported above)
one_loader = DataLoader([(spec, labels)], batch_size=1, collate_fn=collate_fn)

# Train for 200 epochs on just that one example
model = SpeechRecognitionModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(1, 201):
    model.train()
    for specs, spec_lens, labs, lab_lens in one_loader:
        specs, labs = specs.to(device), labs.to(device)
        optimizer.zero_grad()
        out = model(specs, spec_lens).permute(1,0,2)
        loss = ctc_loss(out, labs, spec_lens, lab_lens)
        loss.backward(); optimizer.step()
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# Now decode it
pred = transcribe_file(os.path.join(wav_dir, train_ds.dataset.wavs[train_ds.indices[0]]))
gt   = collapse_tokens(labels.tolist(), idx2char)
print("GT :", gt)
print("PRED:", pred)

And the output:

Epoch 200, Loss: 2.4376
GT : the busineses are listed in the indonesian and singapore stock exchanges
PRED: 
  • I would recommend starting from something simpler. Try working with pre-trained models unless building one yourself is the point. Here is a torchaudio-based resource you could use: deepgram.com/learn/python-speech-recognition-locally-torchaudio
  • I am expected to create a model without using any pre-trained models. I know how to use transformers and other pre-trained models, but I have no ML experience beyond theory, and I am expected to build this within 1–1.5 months and compare it with Whisper and Wav2Vec on WER and CER.
  • This seems awfully ambitious; who is requiring this?
  • By the way, there are similar sites: Data Science, Cross Validated, Artificial Intelligence.
  • It is required by my university as a project. I have already tried this question on Cross Validated; members said it was off topic there since it didn't specifically involve statistics.
