附录1:语音识别主程序
disp('绘制时域图和幅频图')
[y,FS]=audioread('a1.m4a');%读入语音文件
y1=audioplayer(y,FS);
play(y1);%播放文件
subplot(2,1,1);
N=length(y);
n=0:N-1;
n=n/FS;
plot(n,y);%绘制时域图像
xlabel('时间/s');ylabel('百分贝');title('时域波形图');
subplot(2,1,2);
DrawFFT(y,FS);title('幅频波形图');
pause(1);
figure;
[y,FS]=audioread('a2.m4a');
y2=audioplayer(y,FS);
play(y2);
subplot(2,1,1);
N=length(y);
n=0:N-1;
n=n/FS;
plot(n,y);
xlabel('时间/s');ylabel('百分贝');title('时域波形图');
subplot(2,1,2);
DrawFFT(y,FS);title('幅频波形图');
pause(1);
figure;
[y,FS]=audioread('a3.m4a');
y3=audioplayer(y,FS);
play(y3);
subplot(2,1,1);
N=length(y);
n=0:N-1;
n=n/FS;
plot(n,y);
xlabel('时间/s');ylabel('百分贝');title('时域波形图');
subplot(2,1,2);
DrawFFT(y,FS);title('幅频波形图');
pause(1);
figure;
[y,FS]=audioread('a4.m4a');
y4=audioplayer(y,FS);
play(y4);
subplot(2,1,1);
N=length(y);
n=0:N-1;
n=n/FS;
plot(n,y);
xlabel('时间/s');ylabel('百分贝');title('时域波形图');
subplot(2,1,2);
DrawFFT(y,FS);title('幅频波形图');
pause(1);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
disp('正在计算参考模板的参数...')
figure;
fname=sprintf('a1.m4a');
title('参考模版1的参数');
[a fs]=audioread(fname);
[a1 a2]=vad(a);
m1=mfcc(a);
m1=m1(a1-2:a2-2,:);
ref(1).mfcc=m1;
figure;
fname=sprintf('a2.m4a');
[b fs1]=audioread(fname);
[b1 b2]=vad(b);
m2=mfcc(b);
m2=m2(b1-2:b2-2,:);
ref(2).mfcc=m2;
figure;
fname=sprintf('a3.m4a');
[c fs2]=audioread(fname);
[c1 c2]=vad(c);
m3=mfcc(c);
m3=m3(c1-2:c2-2,:);
ref(3).mfcc=m3;
figure;
fname=sprintf('a4.m4a');
[d fs3]=audioread(fname);
[d1 d2]=vad(d);
m4=mfcc(d);
m4=m4(d1-2:d2-2,:);
ref(4).mfcc=m4;
disp('正在计算测试模板1的参数...')
figure;
fname=sprintf('b1.m4a');
[e fs5]=audioread(fname);
[e1 e2]=vad(e);
m5=mfcc(e);
m5=m5(e1-2:e2-2,:);
test(1).mfcc=m5;
figure;
fname=sprintf('b2.m4a');
[f fs6]=audioread(fname);
[f1 f2]=vad(f);
m6=mfcc(f);
m6=m6(f1-2:f2-2,:);
test(2).mfcc=m6;
figure;
fname=sprintf('b3.m4a');
[g fs7]=audioread(fname);
[g1 g2]=vad(g);
m6=mfcc(g);
m6=m6(g1-2:g2-2,:);
test(3).mfcc=m6;
figure;
fname=sprintf('b4.m4a');
[h fs8]=audioread(fname);
[h1 h2]=vad(h);
m8=mfcc(h);
m8=m8(h1-2:h2-2,:);
test(4).mfcc=m8;
disp('正在进行模板匹配...')
dist=zeros(4,4);
for i=1:4
for j=1:4
dist(i,j)=dtw(test(i).mfcc,ref(j).mfcc);
end
end
disp('正在计算匹配结果...')
for i=1:4
[d,j]=min(dist(i,:));
fprintf('测试模板%d的识别结果为:%d\n',i,j);
end
disp('正在计算测试模板2的参数...')
figure;
fname=sprintf('c1.m4a');
[e fs5]=audioread(fname);
[e1 e2]=vad(e);
m9=mfcc(e);
m9=m9(e1-2:e2-2,:);
test(1).mfcc=m9;
figure;
fname=sprintf('c2.m4a');
[f fs6]=audioread(fname);
[f1 f2]=vad(f);
m10=mfcc(f);
m10=m10(f1-2:f2-2,:);
test(2).mfcc=m10;
figure;
fname=sprintf('c3.m4a');
[g fs7]=audioread(fname);
[g1 g2]=vad(g);
m11=mfcc(g);
m11=m11(g1-2:g2-2,:);
test(3).mfcc=m11;
figure;
fname=sprintf('c4.m4a');
[h fs8]=audioread(fname);
[h1 h2]=vad(h);
m12=mfcc(h);
m12=m12(h1-2:h2-2,:);
test(4).mfcc=m12;
disp('正在进行模板匹配...')
dist=zeros(4,4);
for t=1:4
for j=1:4
dist(t,j)=dtw(test(t).mfcc,ref(j).mfcc);
end
end
disp('正在计算匹配结果...')
for t=1:4
[d,j]=min(dist(t,:));
fprintf('测试模板%d的识别结果为:%d\n',t,j);
end
disp('正在计算随机模板的参数...')
figure;
fname=sprintf('b4.m4a');
[e fs5]=audioread(fname);
[e1 e2]=vad(e);
m13=mfcc(e);
m13=m13(e1-2:e2-2,:);
test(1).mfcc=m13;
figure;
fname=sprintf('c1.m4a');
[f fs6]=audioread(fname);
[f1 f2]=vad(f);
m14=mfcc(f);
m14=m14(f1-2:f2-2,:);
test(2).mfcc=m14;
figure;
fname=sprintf('b2.m4a');
[g fs7]=audioread(fname);
[g1 g2]=vad(g);
m15=mfcc(g);
m15=m15(g1-2:g2-2,:);
test(3).mfcc=m15;
figure;
fname=sprintf('c3.m4a');
[h fs8]=audioread(fname);
[h1 h2]=vad(h);
m16=mfcc(h);
m16=m16(h1-2:h2-2,:);
test(4).mfcc=m16;
disp('正在进行模板匹配...')
dist=zeros(4,4);
for o=1:4
for j=1:4
dist(o,j)=dtw(test(o).mfcc,ref(j).mfcc);
end
end
disp('正在计算匹配结果...')
for o=1:4
[d,j]=min(dist(o,:));
fprintf('测试模板%d的识别结果为:%d\n',o,j);
end
附录2:MFCC程序“mfcc.m”
function mfc=mfcc(x)
%对输入的语音序列x进行mfcc参数提取,返回mfcc参数和一阶差分mfcc参数,mel滤波器的阶数为24
%fft变换长度为256,采样频率为8000HZ,对x?256点分为一帧
bank=melbankm(24,256,8000,0,0.5,'m');
%归一化mel滤波器组参数
bank=full(bank);
bank=bank/max(bank(:));
%DCT系数,12*24
for k=1:12
n=0:23;
dctcoef(:,k)=cos((2*n+1)*k*pi/(2*24));
end
%归一化倒谱提升窗口
w=1+6*sin(pi*[1:12]./12);
w=w/max(w);
%预加重滤波器
xx=double(x);
xx=filter([1-0.9375],1,xx);
%语音信号分帧
xx=enframe(xx,256,80);
%计算每帧的mfcc参数
for i=1:size(xx,1)
y=xx(i,:);
s=y'.*hamming(256);
t=abs(fft(s));
t=t.^2;%计算能量
c1=dctcoef'*log(bank*t(1:129));%dctcoef为dct系数,bank归一化mel滤波器组系数
c2=c1.*w';%w为归一化倒谱提升窗口
m(i,:)=c2';
end
%差分系数
dtm=zeros(size(m));
for i=3:size(m,1)-2;
dtm(i,:)=-2*m(i-2,:)-m(i-1,1)+m(i+1,:)+2*m(i+2,:);
end
dtm=dtm/3;
%合并mfcc参数和一阶差分mfcc参数
mfc=[m dtm];
%去除首尾两帧,因为这两帧的一阶差分参数为0
mfc=mfc(3:size(m,1)-2,:);
附录3:VAD程序“vad.m”
function [x1,x2] = vad(x)
%幅度归一化到[-1,1]
x = double(x);
x = x / max(abs(x));
%常数设置
FrameLen = 240;
FrameInc = 80;
amp1 = 10;
amp2 = 2;
zcr1 = 10;
zcr2 = 5;
maxsilence = 8; % 8*10ms = 80ms
minlen =15; % 15*10ms = 150ms
status = 0;
count = 0;
silence = 0;
%计算过零率
tmp1 =enframe(x(1:end-1), FrameLen, FrameInc);
tmp2 =enframe(x(2:end) , FrameLen, FrameInc);
signs = (tmp1.*tmp2)<0;
diffs = (tmp1 -tmp2)>0.02;
zcr =sum(signs.*diffs, 2);
%计算短时能量
amp = sum(abs(enframe(filter([1 -0.9375], 1, x),FrameLen, FrameInc)), 2);
%调整能量门限
amp1 = min(amp1, max(amp)/4);
amp2 = min(amp2, max(amp)/8);
%开始端点检测
x1 = 0;
x2 = 0;
for n=1:length(zcr)
goto = 0;
switchstatus
case {0,1}
% 0 = 静音,
%1 = 可能开始
ifamp(n) > amp1
% 确信进入语音段
x1 =max(n-count-1,1);
status = 2;
silence = 0;
count = count + 1;
elseifamp(n) > amp2 | ... % 可能处于语音段
zcr(n) > zcr2
status = 1;
count = count + 1;
else
%静音状态
status = 0;
count = 0;
end
case2,
% 2= 语音段
ifamp(n) > amp2 | ... % 保持在语音段
zcr(n) > zcr2
count = count + 1;
else
%语音将结束
silence = silence+1;
ifsilence < maxsilence % 静音还不够长,尚未结束
count = count + 1;
elseifcount < minlen % 语音长度太短,认为是噪声
status = 0;
silence = 0;
count = 0;
else
%语音结束
status = 3;
end
end
case 3,
break;
end
end
count = count-silence/2;
x2 = x1 + count -1;
subplot(311)
plot(x)
axis([1 length(x) -1 1])
ylabel('语音信号');
line([x1*FrameInc x1*FrameInc], [-1 1], 'Color','red');
line([x2*FrameInc x2*FrameInc], [-1 1], 'Color','red');
subplot(312)
plot(amp);
axis([1 length(amp) 0 max(amp)])
ylabel('短时能量');
line([x1 x1], [min(amp),max(amp)], 'Color', 'red');
line([x2 x2], [min(amp),max(amp)], 'Color', 'red');
subplot(313)
plot(zcr);
axis([1 length(zcr) 0 max(zcr)])
ylabel('过零率');
line([x1 x1], [min(zcr),max(zcr)], 'Color', 'red');
line([x2 x2], [min(zcr),max(zcr)], 'Color', 'red');
附录4:DTW程序“dtw.m”
function dist = dtw( t,r )
n=size(t,1);
m=size(r,1);
%帧匹配距离距阵
d=zeros(n,m);
for i=1:n
for j=1:m
d(i,j)=sum((t(i,:)-r(j,:)).^2);
end
end
%累积距离矩阵
D=ones(n,m)*realmax;
D(1,1)=d(1,1);
%动态规划
for i=2:n
for j=1:m
D1=D(i-1,j);
ifj>1
D2=D(i-1,j-1);
else
D2=realmax;
end
ifj>2
D3=D(i-1,j-2);
else
D3=realmax;
end
D(i,j)=d(i,j)+min([D1,D2,D3]);
end
end
dist=D(n,m);
end
附录5:voicebox程序“melbankm.m”
function [x,mc,mn,mx]=melbankm(p,n,fs,fl,fh,w)
if nargin < 6
w='tz';
if nargin< 5
fh=0.5;
ifnargin < 4
fl=0; % min freq is DC
end
end
end
sfact=2-any(w=='s');
wr=' '; %default warping is mel
for i=1:length(w)
ifany(w(i)=='lebf');
wr=w(i);
end
end
if any(w=='h') || any(w=='H')
mflh=[flfh];
else
mflh=[flfh]*fs;
end
if ~any(w=='H')
switch wr
case 'f'
case'l'
if fl<=0
error('Low frequency limit must be >0 for l option');
end
mflh=log10(mflh);
case'e'
mflh=frq2erb(mflh);
case'b'
mflh=frq2bark(mflh);
otherwise
mflh=frq2mel(mflh);
end
end
melrng=mflh*(-1:2:1)';
fn2=floor(n/2);
if isempty(p)
p=ceil(4.6*log10(fs));
end
if any(w=='c')
if p<1
p=round(melrng/(p*1000))+1;
end
melinc=melrng/(p-1);
mflh=mflh+(-1:2:1)*melinc;
else
if p<1
p=round(melrng/(p*1000))-1;
end
melinc=melrng/(p+1);
end
switch wr
case'f'
blim=(mflh(1)+[0 1 p p+1]*melinc)*n/fs;
case 'l'
blim=10.^(mflh(1)+[0 1 p p+1]*melinc)*n/fs;
case 'e'
blim=erb2frq(mflh(1)+[0 1 p p+1]*melinc)*n/fs;
case 'b'
blim=bark2frq(mflh(1)+[0 1 p p+1]*melinc)*n/fs;
otherwise
blim=mel2frq(mflh(1)+[0 1 p p+1]*melinc)*n/fs;
end
mc=mflh(1)+(1:p)*melinc;
b1=floor(blim(1))+1;
b4=min(fn2,ceil(blim(4))-1);
switch wr
case'f'
pf=((b1:b4)*fs/n-mflh(1))/melinc;
case 'l'
pf=(log10((b1:b4)*fs/n)-mflh(1))/melinc;
case 'e'
pf=(frq2erb((b1:b4)*fs/n)-mflh(1))/melinc;
case 'b'
pf=(frq2bark((b1:b4)*fs/n)-mflh(1))/melinc;
otherwise
pf=(frq2mel((b1:b4)*fs/n)-mflh(1))/melinc;
end
if pf(1)<0
pf(1)=[];
b1=b1+1;
end
if pf(end)>=p+1
pf(end)=[];
b4=b4-1;
end
fp=floor(pf);
pm=pf-fp;
k2=find(fp>0,1);
k3=find(fp<p,1,'last');
k4=numel(fp);
if isempty(k2)
k2=k4+1;
end
if isempty(k3)
k3=0;
end
if any(w=='y')
mn=1;
mx=fn2+1;
r=[ones(1,k2+b1-1) 1+fp(k2:k3) fp(k2:k3) repmat(p,1,fn2-k3-b1+1)];
c=[1:k2+b1-1 k2+b1:k3+b1 k2+b1:k3+b1 k3+b1+1:fn2+1];
v=[ones(1,k2+b1-1) pm(k2:k3) 1-pm(k2:k3) ones(1,fn2-k3-b1+1)];
else
r=[1+fp(1:k3) fp(k2:k4)];
c=[1:k3k2:k4];
v=[pm(1:k3)1-pm(k2:k4)];
mn=b1+1;
mx=b4+1;
end
if b1<0
c=abs(c+b1-1)-b1+1;
end
% end
if any(w=='n')
v=0.5-0.5*cos(v*pi);
elseif any(w=='m')
v=0.5-0.46/1.08*cos(v*pi);
end
if sfact==2
msk=(c+mn>2) & (c+mn<n-fn2+2);
v(msk)=2*v(msk);
end
if nargout > 2
x=sparse(r,c,v);
ifnargout == 3
mc=mn;
mn=mx;
end
else
x=sparse(r,c+mn-1,v,p,1+fn2);
end
if any(w=='u')
sx=sum(x,2);
x=x./repmat(sx+(sx==0),1,size(x,2));
end
if ~nargout || any(w=='g')
ng=201;
me=mflh(1)+(0:p+1)'*melinc;
switch wr
case 'f'
fe=me;
xg=repmat(linspace(0,1,ng),p,1).*repmat(me(3:end)-me(1:end-2),1,ng)+repmat(me(1:end-2),1,ng);
case'l'
fe=10.^me;
xg=10.^(repmat(linspace(0,1,ng),p,1).*repmat(me(3:end)-me(1:end-2),1,ng)+repmat(me(1:end-2),1,ng));
case'e'
fe=erb2frq(me);
xg=erb2frq(repmat(linspace(0,1,ng),p,1).*repmat(me(3:end)-me(1:end-2),1,ng)+repmat(me(1:end-2),1,ng));
case'b'
fe=bark2frq(me);
xg=bark2frq(repmat(linspace(0,1,ng),p,1).*repmat(me(3:end)-me(1:end-2),1,ng)+repmat(me(1:end-2),1,ng));
otherwise
fe=mel2frq(me);
xg=mel2frq(repmat(linspace(0,1,ng),p,1).*repmat(me(3:end)-me(1:end-2),1,ng)+repmat(me(1:end-2),1,ng));
end
v=1-abs(linspace(-1,1,ng));
ifany(w=='n')
v=0.5-0.5*cos(v*pi);
elseifany(w=='m')
v=0.5-0.46/1.08*cos(v*pi);
end
v=v*sfact;
v=repmat(v,p,1);
ifany(w=='y')
v(1,xg(1,:)<fe(2))=sfact;
v(end,xg(end,:)>fe(p+1))=sfact;
end
ifany(w=='u')
dx=(xg(:,3:end)-xg(:,1:end-2))/2;
dx=dx(:,[1 1:ng-2 ng-2]);
vs=sum(v.*dx,2);
v=v./repmat(vs+(vs==0),1,ng)*fs/n;
end
plot(xg',v','b');
set(gca,'xlim',[fe(1) fe(end)]);
xlabel(['Frequency (' xticksi 'Hz)']);
end
附录6:辅助程序“enframe.m”
function [f,t,w]=enframe(x,win,inc,m)
nx=length(x(:)); % 取数据长度
if nargin<2 || isempty(win)
win=nx;
end
if nargin<4 || isempty(m)
m='';
end
nwin=length(win); % 取窗长
if nwin == 1 % 判断窗长是否为1,若为1,即表示没有设窗函数
lw = win;% 是,帧长=win
w =ones(1,lw);
else
lw =nwin; % 否,帧长=窗长
w =win(:)';
end
if (nargin < 3) || isempty(inc) % 如果只有两个参数,设帧inc=帧长
inc = lw;
end
nli=nx-lw+inc;
nf = fix((nli)/inc); % 计算帧数
na=nli-inc*nf;
f=zeros(nf,lw); % 初始化
indf= inc*(0:(nf-1)).'; % 设置每帧在x中的位移量位置
inds = (1:lw); % 每帧数据对应1:len
f(:) = x(indf(:,ones(1,lw))+inds(ones(nf,1),:)); % 对数据分帧
if nargin>3 && (any(m=='z') ||any(m=='r')) && na>0
ifany(m=='r')
ix=1+mod(nx-na:nx-na+lw-1,2*nx);
f(nf+1,:)=x(ix+(ix>nx).*(2*nx+1-2*ix));
else
f(nf+1,1:na)=x(1+nx-na:nx);
end
nf=size(f,1);
end
if (nwin > 1) % 若参数中包括窗函数,把每帧乘以窗函数
f = f .*w(ones(nf,1),:);
end
if nargout>1
ifany(m=='E')
t0=sum((1:lw).*w.^2)/sum(w.^2);
elseifany(m=='A')
t0=sum((1:lw).*w)/sum(w);
else
t0=(1+lw)/2;
end
t=t0+inc*(0:(nf-1)).';
end
附录7:绘制幅频图程序(DrawFFT.m)
function [ ]= DrawFFT( x, Fs )
% DrawFFT 对输入信号进行快速傅里叶变换
% 输入参数:x :输入信号; Fs:采样频率
L = length(x);
NFFT = 2^nextpow2(L); %确定FFT变换的长度
y = fft(x, NFFT)/L;
f = Fs/2*linspace(0,1,NFFT/2+1); %频率向量
plot(f, 2*abs(y(1:NFFT/2+1))); %绘制频域图像
title('幅度谱');
xlabel('频率(Hz)');
ylabel('|y(f)|');
end |