function [predictor,response,segmentsPerFile] = vggishPreprocess(ads,overlap)
%VGGISHPREPROCESS Convert the files of a datastore into log-mel segments.
%   [predictor,response,segmentsPerFile] = vggishPreprocess(ads,overlap)
%   reads every file of ADS once with read(ads), mixes multichannel audio
%   down to mono, resamples it to 16 kHz, computes a 64-band log-mel
%   spectrogram and cuts it into segments of 96 spectra.
%
%   Inputs:
%     ads     - datastore; the code uses ads.Files, read(ads) and the
%               SampleRate and Label fields of the info struct returned by
%               read (presumably a labeled audioDatastore).
%     overlap - overlap between consecutive segments, in percent. It must
%               be a finite numeric scalar that leaves a hop of at least
%               one spectrum (i.e. less than about 99.5).
%
%   Outputs:
%     predictor       - 96-by-64-by-1-by-N array holding the segments of
%                       all files, concatenated along dimension 4.
%     response        - 1-by-N array with the file label repeated once per
%                       segment.
%     segmentsPerFile - numFiles-by-1 vector with the number of segments
%                       extracted from each file (0 for files that failed
%                       or were too short).
%
%   Files that cannot be processed are reported on the console and skipped.
%
% This function is for example purposes only and may be changed or removed
% in a future release.

% Define spectrogram segmentation parameters
segmentDuration = 0.96; % seconds
segmentRate = 100; % hertz
segmentLength = segmentDuration*segmentRate; % Number of spectrums per auditory spectrograms

% Validate the overlap before doing any work: a hop of zero spectra would
% make the number of segments infinite and every file would fail below.
if ~isnumeric(overlap) || ~isscalar(overlap) || ~isfinite(overlap)
    error('vggishPreprocess:invalidOverlap', ...
        'overlap must be a finite numeric scalar (percentage).');
end
segmentHopDuration = (100-overlap) * segmentDuration / 100; % Duration (s) advanced between auditory spectrograms
segmentHopLength = round(segmentHopDuration * segmentRate); % Number of spectrums advanced between auditory spectrograms
if ~(segmentHopLength >= 1)
    error('vggishPreprocess:invalidOverlap', ...
        'overlap of %g%% leaves no hop between segments; use a smaller value.', ...
        overlap);
end

% Create filter bank
FFTLength = 512;
numBands = 64;
fs0 = 16e3;
filterBank = designAuditoryFilterBank(fs0, ...
    'FrequencyScale','mel', ...
    'FFTLength',FFTLength, ...
    'FrequencyRange',[125 7500], ...
    'NumBands',numBands, ...
    'Normalization','none', ...
    'FilterBankDesignDomain','warped');

% Define STFT parameters
windowLength = 0.025 * fs0;
hopLength = 0.01 * fs0;
win = hann(windowLength,'periodic');

% Preallocate cell arrays for the predictors and responses
numFiles = numel(ads.Files);
predictor = cell(numFiles,1);
response = predictor;
segmentsPerFile = zeros(numFiles,1);

% Extract predictors and responses for each file
for ii = 1:numFiles
    try
        [audioIn,info] = read(ads);

        % Mix multichannel audio down to mono. The mix is rescaled so that
        % its peak equals the largest peak found in the original channels.
        % Single-channel audio is passed through untouched.
        if size(audioIn,2) > 1
            channelPeak = max(abs(audioIn(:)));
            monoMix = sum(audioIn,2);
            mixPeak = max(abs(monoMix));
            if mixPeak > 0
                monoMix = monoMix/mixPeak*channelPeak;
            end
            % When mixPeak is 0 (silence or channels that cancel) the mix
            % is all zeros already; dividing would only produce NaNs.
            audioIn = monoMix;
        end

        x = single(resample(audioIn,fs0,info.SampleRate));

        Y = stft(x, ...
            'Window',win, ...
            'OverlapLength',windowLength-hopLength, ...
            'FFTLength',FFTLength, ...
            'FrequencyRange','onesided');
        Y = abs(Y);

        % Rows are spectra (time), columns are mel bands
        logMelSpectrogram = log(filterBank*Y + single(0.01))';

        % Segment log-mel spectrogram
        numHops = floor((size(Y,2)-segmentLength)/segmentHopLength) + 1;
        if numHops < 1
            % Not enough spectra for a single segment
            numHops = 0;
            fprintf('loop number %d skipped: audio is shorter than one segment\n',ii)
        end
        segmentedLogMelSpectrogram = zeros(segmentLength,numBands,1,numHops);
        for hop = 1:numHops
            firstSpectrum = 1 + segmentHopLength*(hop-1);
            lastSpectrum = segmentLength + segmentHopLength*(hop-1);
            segmentedLogMelSpectrogram(:,:,1,hop) = logMelSpectrogram(firstSpectrum:lastSpectrum,:);
        end

        predictor{ii} = segmentedLogMelSpectrogram;
        response{ii} = repelem(info.Label,numHops);
        segmentsPerFile(ii) = numHops;
    catch ME
        % Best effort: report the reason and continue with the next file
        fprintf('loop number %d failed: %s\n',ii,ME.message)
    end
end

% Concatenate predictors and responses into arrays
predictor = cat(4,predictor{:});
response = cat(2,response{:});
end