Kaldi in practice (1): a small speaker-recognition example (egs/aishell/v1)



# Data locations. (NOTE: in shell there must be NO spaces around '=' in an
# assignment; the quotes may be omitted when the value contains no spaces.)
# These assignments were dropped from the original text; values restored
# from the upstream aishell/v1 recipe — adjust $data to your local disk.
data=/export/a05/xna/data            # where the corpus is downloaded/unpacked
data_url=www.openslr.org/resources/33
mfccdir=mfcc                         # use a large-capacity disk for MFCCs
trials=data/test/aishell_speaker_ver.lst

# Source the shared config files ('.' runs them in the current shell,
# similar to an import): cmd.sh defines $train_cmd, path.sh sets up PATH.
. ./cmd.sh
. ./path.sh

# Exit immediately if any command fails.
set -e # exit on error

# Download and decompress the data (audio and resources).
# Usage: download_and_untar.sh <data-base-dir> <download-url> <corpus-part>
local/download_and_untar.sh $data $data_url data_aishell
local/download_and_untar.sh $data $data_url resource_aishell

# Data preparation: build wav.scp, utt2spk, spk2utt, text, etc.
# from the raw wav files and the transcripts.
local/aishell_data_prep.sh $data/data_aishell/wav $data/data_aishell/transcript

# Extract MFCC features (store $mfccdir on a large-capacity disk).
# Each wav is split into overlapping frames; every frame is encoded as a
# vector of MFCC coefficients, and a VAD decision marks the speech frames.
for x in train test; do
  # make_mfcc.sh <data-dir> <log-dir> <mfcc-output-dir>
  steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/$x exp/make_mfcc/$x $mfccdir
  # compute_vad_decision.sh <data-dir> <log-dir> <vad-output-dir>
  sid/compute_vad_decision.sh --nj 10 --cmd "$train_cmd" data/$x exp/make_mfcc/$x $mfccdir
  # Drop utterances missing from any of feats.scp/vad.scp/utt2spk so the
  # index files stay mutually consistent (takes only the data dir).
  utils/fix_data_dir.sh data/$x
done  # <-- missing in the original text; the loop was never closed

# Train a diagonal-covariance UBM (1024 Gaussians) on the training data.
# train_diag_ubm.sh <data-dir> <num-gauss> <output-dir>
sid/train_diag_ubm.sh --nj 10 --cmd "$train_cmd" --num-threads 16 \
  data/train 1024 exp/diag_ubm_1024
# Refine it into a full-covariance UBM, initialised from the diagonal one.
sid/train_full_ubm.sh --nj 10 --cmd "$train_cmd" data/train \
  exp/diag_ubm_1024 exp/full_ubm_1024
# Train the i-vector extractor (total-variability matrix) on the full UBM.
# The output directory exp/extractor_1024 was dropped from the original
# text (the command ended on a dangling '\'); it is required here and is
# read by every extract_ivectors.sh call below.
sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 10G" \
  --num-iters 5 exp/full_ubm_1024/final.ubm data/train \
  exp/extractor_1024

# Extract i-vectors for the training set (used to train PLDA below).
sid/extract_ivectors.sh --cmd "$train_cmd" --nj 10 \
  exp/extractor_1024 data/train exp/ivector_train_1024

# Train the PLDA back-end on length-normalised training i-vectors.
# The output argument exp/ivector_train_1024/plda was dropped from the
# original text (dangling '\'); it is restored here and is read by the
# scoring stage below.
$train_cmd exp/ivector_train_1024/log/plda.log \
  ivector-compute-plda ark:data/train/spk2utt \
  'ark:ivector-normalize-length scp:exp/ivector_train_1024/ivector.scp  ark:- |' \
  exp/ivector_train_1024/plda

# --- ↓ split the test set into enroll and eval ↓ ---
mkdir -p data/test/enroll data/test/eval
# Copy the shared index files into both subsets. Brace expansion must stay
# on a single line — the original text had a line break after '{', which
# breaks the expansion and makes cp fail.
cp data/test/{spk2utt,feats.scp,vad.scp} data/test/enroll
cp data/test/{spk2utt,feats.scp,vad.scp} data/test/eval

# Split the test set's utt2spk into an enrollment set and an evaluation set.
# Input:  data/test/utt2spk
# Output: data/test/enroll/utt2spk, data/test/eval/utt2spk
# For each speaker, a few utterances are randomly chosen for enrollment and
# the remaining utterances form the evaluation (verification) set.
local/split_data_enroll_eval.py data/test/utt2spk  data/test/enroll/utt2spk  data/test/eval/utt2spk

# Generate the trials file, one line per trial:
#   <uttid> <spkid> target|nontarget
# ($trials is the path the scoring stage reads below.)
local/produce_trials.py data/test/eval/utt2spk $trials

utils/fix_data_dir.sh data/test/enroll
utils/fix_data_dir.sh data/test/eval
# utils/fix_data_dir.sh
# This script makes sure that only the segments present in
# all of "feats.scp", "wav.scp" [if present], segments [if present]
# text, and utt2spk are present in any of them.
# It puts the original contents of data-dir into
# data-dir/.backup

# --- ↑ split the test to enroll and eval ↑ ---

# Extract i-vectors for both halves of the test set — the enrollment
# utterances and the evaluation (verification) utterances — using the
# extractor trained above.
for part in enroll eval; do
  sid/extract_ivectors.sh --cmd "$train_cmd" --nj 10 \
    exp/extractor_1024 "data/test/$part" "exp/ivector_${part}_1024"
done

# Score every trial with the trained PLDA model.

$train_cmd exp/ivector_eval_1024/log/plda_score.log \
  ivector-plda-scoring --num-utts=ark:exp/ivector_enroll_1024/num_utts.ark \
  exp/ivector_train_1024/plda \
  ark:exp/ivector_enroll_1024/spk_ivector.ark \
  "ark:ivector-normalize-length scp:exp/ivector_eval_1024/ivector.scp ark:- |" \
  "cat '$trials' | awk '{print \\\$2, \\\$1}' |" exp/trials_out

# ~kaldi/src/ivectorbin/ivector-plda-scoring.cc
# Uses the PLDA model to compute log-likelihood ratios for the trials.
# Input  (trials file): "<key1> <key2>\n"
# Output: "<key1> <key2> [<score>]\n"
# NOTE(review): the awk above swaps the trial columns ($2, $1), presumably
# to put them in the <enroll-key> <eval-key> order the binary expects —
# confirm against the format produced by produce_trials.py. The triple
# backslash (\\\$) survives the extra expansion layer introduced by
# $train_cmd so that awk, not the shell, sees $1/$2.

# Compute the equal error rate (EER): take column 3 (the score) from
# exp/trials_out, pair it line-by-line with $trials, and feed
# "<score> <target|nontarget>" pairs to compute-eer on stdin.
awk '{print $3}' exp/trials_out | paste - $trials | awk '{print $1, $4}' | compute-eer -

# Example result:
# Scoring against data/test/aishell_speaker_ver.lst
# Equal error rate is 0.140528%, at threshold -12.018

exit 0

  1. Some basic Linux shell scripting: e.g. variable assignment (`<name>=<value>`, no spaces around `=`) and loading other scripts with `.` (source)
  2. The main stages of a Kaldi speaker-recognition recipe


  1. From the speaker recognition DEMO, learn kaldi – (1) run.sh
  2. kaldi Introduction: Create the first Chinese ASR (AISHELL-1)


Random Posts

Multi-process read/write locks

ncl installation

How to obtain Qt’s default file paths

MYBATIS After inserting the data, the object immediately obtains the ID method

Use pytorch to classify image