CMUSphinx: ошибка 65280 в word_align.pl при обучении модели для нового языка
При обучении акустической модели для моего языка я получаю эту ошибку. Я не использую параметр force_align. Исполняемые файлы sphinx_fe и pocketsphinx_batch находятся в PATH. В журнале декодирования никаких ошибок не указано. Я следовал руководству по обучению и прочитал все темы об этой ошибке, но решения не нашёл. Ниже приведены мои файлы конфигурации и журнал этапа декодирования. Заранее спасибо.
# Configuration script for sphinx trainer -*-mode:Perl-*-

# Controls how much output goes to the screen.
$CFG_VERBOSE = 1;

# The values below are filled in at configuration time.

# Name of the training database.
$CFG_DB_NAME = 'tur';

# Experiment name; model files and log files are named after it.
# Here it is simply the database name.
$CFG_EXPTNAME = $CFG_DB_NAME;

# Base directory of this training setup, followed by the SphinxTrain
# installation, its binaries and its scripts.
$CFG_BASE_DIR        = 'C:/sphinx/tur';
$CFG_SPHINXTRAIN_DIR = 'C:/sphinx/sphinxtrain';
$CFG_BIN_DIR         = 'C:/sphinx/sphinxtrain/bin/Release/x64';
$CFG_SCRIPT_DIR      = 'C:/sphinx/sphinxtrain/scripts';
# Audio waveform files: location, extension and container type.
$CFG_WAVFILES_DIR      = $CFG_BASE_DIR . '/wav';
$CFG_WAVFILE_EXTENSION = "wav";
$CFG_WAVFILE_TYPE      = "mswav";    # one of nist, mswav, raw

# Feature files: location and extension.
$CFG_FEATFILES_DIR      = $CFG_BASE_DIR . '/feat';
$CFG_FEATFILE_EXTENSION = "mfc";

# Feature extraction parameters.
$CFG_WAVFILE_SRATE = 16000.0;

# 25 for wideband speech; for 8 kHz telephone speech a reasonable
# value is 15.
$CFG_NUM_FILT = 25;
# For 8 kHz telephone speech the value is 200.
$CFG_LO_FILT = 130;
# For 8 kHz telephone speech the value is 3500.
$CFG_HI_FILT = 6800;

# The legacy transform was used previously, but dct is more accurate.
$CFG_TRANSFORM = 'dct';
# Cepstrum lifter: smoothing that improves recognition.
$CFG_LIFTER = '22';
# 13 is usually enough.
$CFG_VECTOR_LENGTH = 13;

# Baum-Welch iteration bounds: iterate at least MIN times and never
# more than MAX times (reaching MAX suggests something is wrong).
($CFG_MIN_ITERATIONS, $CFG_MAX_ITERATIONS) = (1, 10);
# Automatic gain control applied to input files: none or max.
$CFG_AGC = "none";

# Cepstral mean subtraction/normalization applied to input files.
# NOTE(review): the template comment listed only current/none as
# choices, yet 'batch' is configured here -- confirm the accepted values.
$CFG_CMN = "batch";

# Normalize the variance of input files to 1.0: yes or no.
$CFG_VARNORM = "no";

# Train full covariance matrices: yes or no.
$CFG_FULLVAR = "no";

# Use only the diagonals of full covariance matrices for
# Forward-Backward evaluation: yes or no (recommended when
# CFG_FULLVAR is yes).
$CFG_DIAGFULL = "no";

# Vocal tract length normalization in training: yes or no.  Enabling it
# produces a "normalized" model which requires VTLN to be done during
# decoding as well.
$CFG_VTLN = "no";

# VTLN warp factors: starting value, ending value, step size.
($CFG_VTLN_START, $CFG_VTLN_END, $CFG_VTLN_STEP) = (0.80, 1.40, 0.05);
# Working directories; each one sits directly under $CFG_BASE_DIR.

# Queue manager logs are written here.
$CFG_QMGR_DIR = join '/', $CFG_BASE_DIR, 'qmanager';

# Training logs are written here.
$CFG_LOG_DIR = join '/', $CFG_BASE_DIR, 'logdir';

# Re-estimation counts.
$CFG_BWACCUM_DIR = join '/', $CFG_BASE_DIR, 'bwaccumdir';

# Model parameter files are written here.
$CFG_MODEL_DIR = join '/', $CFG_BASE_DIR, 'model_parameters';

# Transcripts and control files for speaker-adaptive training.
$CFG_LIST_DIR = join '/', $CFG_BASE_DIR, 'etc';
# Decoding variables for MMIE training.
$CFG_LANGUAGEWEIGHT = '11.5';
$CFG_BEAMWIDTH      = '1e-100';
$CFG_WORDBEAM       = '1e-80';
$CFG_LANGUAGEMODEL  = $CFG_LIST_DIR . '/' . $CFG_DB_NAME . '.lm.DMP';
$CFG_WORDPENALTY    = '0.2';

# Lattice pruning variables.
$CFG_ABEAM             = '1e-50';
$CFG_NBEAM             = '1e-10';
$CFG_PRUNED_DENLAT_DIR = $CFG_BASE_DIR . '/pruned_denlat';

# MMIE training related variables.
$CFG_MMIE                = 'no';
$CFG_MMIE_MAX_ITERATIONS = 5;
$CFG_LATTICE_DIR         = $CFG_BASE_DIR . '/lattice';
# Valid values are "rand", "best" or "ci".
$CFG_MMIE_TYPE   = 'rand';
$CFG_MMIE_CONSTE = '3.0';
$CFG_NUMLAT_DIR  = $CFG_BASE_DIR . '/numlat';
$CFG_DENLAT_DIR  = $CFG_BASE_DIR . '/denlat';
# Variables used in main training of models.  Every file is looked up
# in $CFG_LIST_DIR; all but feat.params are named after the database.
$CFG_DICTIONARY     = $CFG_LIST_DIR . '/' . $CFG_DB_NAME . '.dic';
$CFG_RAWPHONEFILE   = $CFG_LIST_DIR . '/' . $CFG_DB_NAME . '.phone';
$CFG_FILLERDICT     = $CFG_LIST_DIR . '/' . $CFG_DB_NAME . '.filler';
$CFG_LISTOFFILES    = $CFG_LIST_DIR . '/' . $CFG_DB_NAME . '_train.fileids';
$CFG_TRANSCRIPTFILE = $CFG_LIST_DIR . '/' . $CFG_DB_NAME . '_train.transcription';
$CFG_FEATPARAMS     = $CFG_LIST_DIR . '/feat.params';
# Variables used in characterizing models.
# Exactly one of the following assignments should be active.
$CFG_HMM_TYPE = '.cont.'; # Sphinx 4, PocketSphinx
#$CFG_HMM_TYPE = '.semi.'; # PocketSphinx
#$CFG_HMM_TYPE = '.ptm.'; # PocketSphinx (larger data sets)

# Stop right away when the chosen type is not one of the three known ones.
die "Please choose one CFG_HMM_TYPE out of '.cont.', '.ptm.', or '.semi.', " .
    "currently $CFG_HMM_TYPE\n"
    unless grep { $CFG_HMM_TYPE eq $_ } '.semi.', '.ptm.', '.cont.';

# This configuration is fastest and best for most acoustic models in
# PocketSphinx and Sphinx-III. See below for Sphinx-II.
$CFG_STATESPERHMM = 3;
$CFG_SKIPSTATE    = 'no';

# Per-type settings: directory label, feature stream layout and the
# initial/final number of densities, followed by a sanity check on the
# two density counts.
if ($CFG_HMM_TYPE eq '.cont.') {
    # Single stream features - Sphinx 3
    $CFG_DIRLABEL              = 'cont';
    $CFG_FEATURE               = '1s_c_d_dd';
    $CFG_NUM_STREAMS           = 1;
    $CFG_INITIAL_NUM_DENSITIES = 1;
    $CFG_FINAL_NUM_DENSITIES   = 8;
    $CFG_INITIAL_NUM_DENSITIES <= $CFG_FINAL_NUM_DENSITIES
        or die "The initial has to be less than the final number of densities";
}
elsif ($CFG_HMM_TYPE eq '.ptm.') {
    # Four stream features for PocketSphinx
    $CFG_DIRLABEL              = 'ptm';
    $CFG_FEATURE               = 's2_4x';
    $CFG_NUM_STREAMS           = 4;
    $CFG_INITIAL_NUM_DENSITIES = 8;
    $CFG_FINAL_NUM_DENSITIES   = 8;
    $CFG_INITIAL_NUM_DENSITIES == $CFG_FINAL_NUM_DENSITIES
        or die "For phonetically tied models, the initial and final models have the same density";
}
elsif ($CFG_HMM_TYPE eq '.semi.') {
    # Four stream features for PocketSphinx
    $CFG_DIRLABEL              = 'semi';
    $CFG_FEATURE               = 's2_4x';
    $CFG_NUM_STREAMS           = 4;
    $CFG_INITIAL_NUM_DENSITIES = 8;
    $CFG_FINAL_NUM_DENSITIES   = 8;
    $CFG_INITIAL_NUM_DENSITIES == $CFG_FINAL_NUM_DENSITIES
        or die "For semi continuous models, the initial and final models have the same density";
}
# Number of top gaussians used to score a frame.  Slightly less accurate
# computation makes training significantly faster.  Uncomment to apply
# this during training; for good accuracy use the same setting in the
# decoder.  In theory the value can differ between training stages, for
# example 4 for the CI stage and 16 for the CD stage.
# $CFG_CI_TOPN = 4;
# $CFG_CD_TOPN = 16;

# (yes/no) Train multiple-gaussian context-independent models in the
# models created specifically for forced alignment (useful for
# alignment, use 'no' otherwise).
$CFG_FALIGN_CI_MGAU = "no";

# (yes/no) Train multiple-gaussian context-independent models (useful
# for alignment, use 'no' otherwise).
$CFG_CI_MGAU = "no";

# (yes/no) Train context-dependent models.
$CFG_CD_TRAIN = "no";

# Number of tied states (senones) to create in decision-tree clustering.
$CFG_N_TIED_STATES = 200;

# Number of parts to run Forward-Backward estimation in.
$CFG_NPART = 1;

# (yes/no) Train a single decision tree for all phones (actually one
# per state); useful for grapheme-based models, use 'no' otherwise.
$CFG_CROSS_PHONE_TREES = "no";

# Use force-aligned transcripts (if available) as input to training.
$CFG_FORCEDALIGN = "no";
# Optional: a specific set of models for force alignment.  When left
# undefined, the context-independent models of the current experiment
# are used.
#$CFG_FORCE_ALIGN_MODELDIR = "$CFG_MODEL_DIR/$CFG_EXPTNAME.falign_ci_$CFG_DIRLABEL";

# Optional: a specific dictionary and filler dictionary for force
# alignment.  When left undefined, both are created from
# $CFG_DICTIONARY and $CFG_FILLERDICT, with noise words removed from
# the filler dictionary and added to the dictionary (because force
# alignment is not very good at inserting them).
# $CFG_FORCE_ALIGN_DICTIONARY = "$ST::CFG_BASE_DIR/falignout$ST::CFG_EXPTNAME.falign.dict";;
# $CFG_FORCE_ALIGN_FILLERDICT = "$ST::CFG_BASE_DIR/falignout/$ST::CFG_EXPTNAME.falign.fdict";;

# Beam width for force alignment.  The wider (i.e. numerically smaller)
# the beam, the fewer sentences are rejected for bad alignment.
$CFG_FORCE_ALIGN_BEAM = 1e-60;

# Calculate an LDA/MLLT transform?
$CFG_LDA_MLLT = "no";
# Dimensionality of the LDA/MLLT output.
$CFG_LDA_DIMENSION = 29;

# This is actually just a difference in log space (it would not make
# sense otherwise, because different feature parameters have very
# different likelihoods).
$CFG_CONVERGENCE_RATIO = 0.1;

# Queue::POSIX for multiple CPUs on a local machine,
# Queue::PBS to use a PBS/TORQUE queue.
$CFG_QUEUE_TYPE = 'Queue';
# Name of the queue to use for PBS/TORQUE.
$CFG_QUEUE_NAME = 'workq';

# (yes/no) Build questions for decision tree clustering automatically.
$CFG_MAKE_QUESTS = 'yes';

# With CFG_MAKE_QUESTS set to yes the questions are written to this
# file; with it set to no they are read from this file.
$CFG_QUESTION_SET = $CFG_BASE_DIR . '/model_architecture/' . $CFG_EXPTNAME . '.tree_questions';
#$CFG_QUESTION_SET = "${CFG_BASE_DIR}/linguistic_questions";
$CFG_CP_OPERATION = $CFG_BASE_DIR . '/model_architecture/' . $CFG_EXPTNAME . '.cpmeanvar';

# Configuration for grapheme-to-phoneme model.
$CFG_G2P_MODEL = "no";
# Configuration script for sphinx decoder
#
# $DEC_CFG_* variables are decoder-specific arguments; $CFG_* variables
# are trainer arguments, some of which the decoder uses as well.

# Controls how much output goes to the screen.
$DEC_CFG_VERBOSE = 1;

# The values below are filled in at configuration time.

# Name of the decoding script to use (psdecode.pl or s3decode.pl, probably).
$DEC_CFG_SCRIPT   = "psdecode.pl";
$DEC_CFG_EXPTNAME = "${CFG_EXPTNAME}";
$DEC_CFG_JOBNAME  = "${CFG_EXPTNAME}_job";

# Models to use.
$DEC_CFG_MODEL_NAME = "${CFG_EXPTNAME}.ci_cont";

# Feature files, plus the AGC/CMN/variance settings copied from the
# trainer variables.
$DEC_CFG_FEATFILES_DIR      = "${CFG_BASE_DIR}/feat";
$DEC_CFG_FEATFILE_EXTENSION = ".mfc";
($DEC_CFG_AGC, $DEC_CFG_CMN, $DEC_CFG_VARNORM) = ($CFG_AGC, $CFG_CMN, $CFG_VARNORM);

# Queue manager, log and model directories.
$DEC_CFG_QMGR_DIR  = "${CFG_BASE_DIR}/qmanager";
$DEC_CFG_LOG_DIR   = "${CFG_BASE_DIR}/logdir";
$DEC_CFG_MODEL_DIR = "${CFG_MODEL_DIR}";

# Dictionaries and the test set control/transcription files.
$DEC_CFG_DICTIONARY     = "${CFG_BASE_DIR}/etc/${CFG_DB_NAME}.dic";
$DEC_CFG_FILLERDICT     = "${CFG_BASE_DIR}/etc/${CFG_DB_NAME}.filler";
$DEC_CFG_LISTOFFILES    = "${CFG_BASE_DIR}/etc/${CFG_DB_NAME}_test.fileids";
$DEC_CFG_TRANSCRIPTFILE = "${CFG_BASE_DIR}/etc/${CFG_DB_NAME}_test.transcription";

# Result directories.
$DEC_CFG_RESULT_DIR  = "${CFG_BASE_DIR}/result";
$DEC_CFG_PRESULT_DIR = "${CFG_BASE_DIR}/presult";

# These variables, used by the decoder, have to be user defined, and
# may affect the decoder output.
#$DEC_CFG_LANGUAGEMODEL = "$CFG_BASE_DIR/etc/${CFG_DB_NAME}.lm.DMP";
# Or can be JSGF or FSG too, used if uncommented
$DEC_CFG_GRAMMAR = "${CFG_BASE_DIR}/etc/${CFG_DB_NAME}.jsgf";
# $DEC_CFG_FSG = "$CFG_BASE_DIR/etc/${CFG_DB_NAME}.fsg";

$DEC_CFG_LANGUAGEWEIGHT = '10';
$DEC_CFG_BEAMWIDTH      = '1e-80';
$DEC_CFG_WORDBEAM       = '1e-40';
$DEC_CFG_WORDPENALTY    = '0.2';
$DEC_CFG_ALIGN          = 'builtin';

# Number of pieces to split the decode into.
$DEC_CFG_NPART = 1;
# Marks the configuration as completely read.
# This variable has to be defined, otherwise utils.pl will not load.
$CFG_DONE = 1;
# End of the configuration: hand a true value back to the loader.
# NOTE(review): a top-level return is only valid when this file is
# pulled in via do/require -- confirm how the loading script reads it.
return 1;
INFO: pocketsphinx.c(152): Parsed model-specific feature parameters from C:/sphinx/tur/model_parameters/tur.ci_cont/feat.params
Current configuration:
[NAME] [DEFLT] [VALUE]
-agc none none
-agcthresh 2.0 2.000000e+00
-allphone
-allphone_ci yes yes
-alpha 0.97 9.700000e-01
-ascale 20.0 2.000000e+01
-aw 1 1
-backtrace no no
-beam 1e-48 1.000000e-80
-bestpath yes yes
-bestpathlw 9.5 1.000000e+01
-ceplen 13 13
-cmn live batch
-cmninit 40,3,-1 40,3,-1
-compallsen no no
-dict C:/sphinx/tur/etc/tur.dic
-dictcase no no
-dither no no
-doublebw no no
-ds 1 1
-fdict
-feat 1s_c_d_dd 1s_c_d_dd
-featparams
-fillprob 1e-8 1.000000e-08
-frate 100 100
-fsg
-fsgusealtpron yes yes
-fsgusefiller yes yes
-fwdflat yes yes
-fwdflatbeam 1e-64 1.000000e-80
-fwdflatefwid 4 4
-fwdflatlw 8.5 1.000000e+01
-fwdflatsfwin 25 25
-fwdflatwbeam 7e-29 1.000000e-40
-fwdtree yes yes
-hmm C:/sphinx/tur/model_parameters/tur.ci_cont
-input_endian little little
-jsgf C:/sphinx/tur/etc/tur.jsgf
-keyphrase
-kws
-kws_delay 10 10
-kws_plp 1e-1 1.000000e-01
-kws_threshold 1e-30 1.000000e-30
-latsize 5000 5000
-lda
-ldadim 0 0
-lifter 0 22
-lm
-lmctl
-lmname
-logbase 1.0001 1.000100e+00
-logfn
-logspec no no
-lowerf 133.33334 1.300000e+02
-lpbeam 1e-40 1.000000e-80
-lponlybeam 7e-29 1.000000e-80
-lw 6.5 1.000000e+01
-maxhmmpf 30000 30000
-maxwpf -1 -1
-mdef
-mean
-mfclogdir
-min_endfr 0 0
-mixw
-mixwfloor 0.0000001 1.000000e-07
-mllr
-mmap yes yes
-ncep 13 13
-nfft 512 512
-nfilt 40 25
-nwpen 1.0 1.000000e+00
-pbeam 1e-48 1.000000e-80
-pip 1.0 1.000000e+00
-pl_beam 1e-10 1.000000e-10
-pl_pbeam 1e-10 1.000000e-10
-pl_pip 1.0 1.000000e+00
-pl_weight 3.0 3.000000e+00
-pl_window 5 5
-rawlogdir
-remove_dc no no
-remove_noise yes yes
-remove_silence yes yes
-round_filters yes yes
-samprate 16000 1.600000e+04
-seed -1 -1
-sendump
-senlogdir
-senmgau
-silprob 0.005 5.000000e-03
-smoothspec no no
-svspec
-tmat
-tmatfloor 0.0001 1.000000e-04
-topn 4 4
-topn_beam 0 0
-toprule
-transform legacy dct
-unit_area yes yes
-upperf 6855.4976 6.800000e+03
-uw 1.0 1.000000e+00
-vad_postspeech 50 50
-vad_prespeech 20 20
-vad_startspeech 10 10
-vad_threshold 3.0 3.000000e+00
-var
-varfloor 0.0001 1.000000e-04
-varnorm no no
-verbose no no
-warp_params
-warp_type inverse_linear inverse_linear
-wbeam 7e-29 1.000000e-40
-wip 0.65 2.000000e-01
-wlen 0.025625 2.562500e-02
INFO: feat.c(715): Initializing feature stream to type: '1s_c_d_dd', ceplen=13, CMN='batch', VARNORM='no', AGC='none'
INFO: mdef.c(518): Reading model definition: C:/sphinx/tur/model_parameters/tur.ci_cont/mdef
INFO: bin_mdef.c(181): Allocating 92 * 8 bytes (0 KiB) for CD tree
INFO: tmat.c(149): Reading HMM transition probability matrices: C:/sphinx/tur/model_parameters/tur.ci_cont/transition_matrices
INFO: acmod.c(113): Attempting to use PTM computation module
INFO: ms_gauden.c(127): Reading mixture gaussian parameter: C:/sphinx/tur/model_parameters/tur.ci_cont/means
INFO: ms_gauden.c(242): 66 codebook, 1 feature, size:
INFO: ms_gauden.c(244): 1x39
INFO: ms_gauden.c(127): Reading mixture gaussian parameter: C:/sphinx/tur/model_parameters/tur.ci_cont/variances
INFO: ms_gauden.c(242): 66 codebook, 1 feature, size:
INFO: ms_gauden.c(244): 1x39
INFO: ms_gauden.c(304): 0 variance values floored
INFO: ptm_mgau.c(807): Number of codebooks doesn't match number of ciphones, doesn't look like PTM: 66 != 22
INFO: acmod.c(115): Attempting to use semi-continuous computation module
INFO: ms_gauden.c(127): Reading mixture gaussian parameter: C:/sphinx/tur/model_parameters/tur.ci_cont/means
INFO: ms_gauden.c(242): 66 codebook, 1 feature, size:
INFO: ms_gauden.c(244): 1x39
INFO: ms_gauden.c(127): Reading mixture gaussian parameter: C:/sphinx/tur/model_parameters/tur.ci_cont/variances
INFO: ms_gauden.c(242): 66 codebook, 1 feature, size:
INFO: ms_gauden.c(244): 1x39
INFO: ms_gauden.c(304): 0 variance values floored
INFO: acmod.c(117): Falling back to general multi-stream GMM computation
INFO: ms_gauden.c(127): Reading mixture gaussian parameter: C:/sphinx/tur/model_parameters/tur.ci_cont/means
INFO: ms_gauden.c(242): 66 codebook, 1 feature, size:
INFO: ms_gauden.c(244): 1x39
INFO: ms_gauden.c(127): Reading mixture gaussian parameter: C:/sphinx/tur/model_parameters/tur.ci_cont/variances
INFO: ms_gauden.c(242): 66 codebook, 1 feature, size:
INFO: ms_gauden.c(244): 1x39
INFO: ms_gauden.c(304): 0 variance values floored
INFO: ms_senone.c(149): Reading senone mixture weights: C:/sphinx/tur/model_parameters/tur.ci_cont/mixture_weights
INFO: ms_senone.c(200): Truncating senone logs3(pdf) values by 10 bits
INFO: ms_senone.c(207): Not transposing mixture weights in memory
INFO: ms_senone.c(268): Read mixture weights for 66 senones: 1 features x 1 codewords
INFO: ms_senone.c(320): Mapping senones to individual codebooks
INFO: ms_mgau.c(144): The value of topn: 4
WARN: "ms_mgau.c", line 148: -topn argument (4) invalid or > #density codewords (1); set to latter
INFO: phone_loop_search.c(114): State beam -225 Phone exit beam -225 Insertion penalty 0
INFO: dict.c(320): Allocating 4105 * 32 bytes (128 KiB) for word entries
INFO: dict.c(333): Reading main dictionary: C:/sphinx/tur/etc/tur.dic
INFO: dict.c(213): Dictionary size 6, allocated 0 KiB for strings, 0 KiB for phones
INFO: dict.c(336): 6 words read
INFO: dict.c(358): Reading filler dictionary: C:/sphinx/tur/model_parameters/tur.ci_cont/noisedict
INFO: dict.c(213): Dictionary size 9, allocated 0 KiB for strings, 0 KiB for phones
INFO: dict.c(361): 3 words read
INFO: dict2pid.c(396): Building PID tables for dictionary
INFO: dict2pid.c(406): Allocating 22^3 * 2 bytes (20 KiB) for word-initial triphones
INFO: dict2pid.c(132): Allocated 11792 bytes (11 KiB) for word-final triphones
INFO: dict2pid.c(196): Allocated 11792 bytes (11 KiB) for single-phone word triphones
INFO: jsgf.c(709): Defined rule:
INFO: jsgf.c(709): Defined rule: PUBLIC
INFO: fsg_model.c(208): Computing transitive closure for null transitions
INFO: fsg_model.c(270): 0 null transitions added
INFO: fsg_search.c(227): FSG(beam: -1799, pbeam: -1799, wbeam: -900; wip: -158, pip: 0)
INFO: fsg_model.c(423): Adding silence transitions for <sil> to FSG
INFO: fsg_model.c(443): Added 2 silence word transitions
INFO: fsg_search.c(173): Added 0 alternate word transitions
INFO: fsg_lextree.c(110): Allocated 92 bytes (0 KiB) for left and right context phones
INFO: fsg_lextree.c(256): 37 HMM nodes in lextree (8 leaves)
INFO: fsg_lextree.c(259): Allocated 5328 bytes (5 KiB) for all lextree nodes
INFO: fsg_lextree.c(262): Allocated 1152 bytes (1 KiB) for lextree leafnodes
INFO: batch.c(778): TOTAL 0.00 seconds speech, 0.00 seconds CPU, 0.00 seconds wall
INFO: batch.c(780): AVERAGE -nan(ind) xRT (CPU), -nan(ind) xRT (elapsed)
INFO: fsg_search.c(265): TOTAL fsg 0.00 CPU -nan(ind) xRT
INFO: fsg_search.c(268): TOTAL fsg 0.00 wall -nan(ind) xRT
Tue Jul 24 11:29:58 2018