function X = textgrid2pws (fname, sr)
% TEXTGRID2PWS : reads a PWS-format TextGrid file into a Matlab PWS structure
%
%   X = textgrid2pws(fname, sr);
%   X = textgrid2pws(fname);      % X.sr will have sample rate set to -1
%
% Inputs
%   fname  file name WITHOUT the '.TextGrid' extension (it is appended here)
%   sr     (optional) sample rate; it is only stored in X.sr and is not
%          used for anything else
%
% The file is read as four interval tiers, in this order:
%   1. phrase tier   - interval text is the phrase number (1,2,3,...);
%                      intervals whose text is not a positive number are
%                      ignored
%   2. word tier     - its name must contain 'ord'; labels containing 'SIL'
%                      are skipped, the label '0' marks an invalid word
%   3. syllable tier
%   4. phoneme tier
%
% Output
%   X.sr, X.filename, X.phrases{k} with fields
%       filename, words, startwords, endwords, wordnames, validword
%   X.phrases{k}.words{w} for a valid word has fields
%       name, numwinphrase, poswinphrase, sylls, tones, startsylls,
%       endsylls, startsyllsPI, endsyllsPI, startphones, endphones, phones
%   and for an invalid word: name ('invalid'), numwinphrase, and (if a
%   syllable interval falls in it) startsylls / endsylls.
%
% Errors are raised when the file does not have the expected layout, when
% phrase numbers are not consecutive, when a syllable label cannot be found
% in its word, or when syllable and phoneme boundaries do not line up.
%
% Requires the external helper FINDCELL (not defined in this file).
%
% See http://people.cs.uchicago.edu/~dinoj/tonerec/ for more details

if nargin==2
    X.sr = sr;
else
    X.sr = -1;
end
TONES = '12345'; % assumed that these are tone labels (one digit each)

% One cell per non-header line; the first 6 lines of the file are skipped.
z = textread([fname '.TextGrid'],'%s','commentstyle','matlab','headerlines',6,'delimiter','\n','bufsize',4096000);
if isempty(z)
    error('%s.TextGrid not right PWS format',fname);
end

% z{1} holds the number of tiers, written either 'size = N' or 'size=N'.
% A failed sscanf gives an EMPTY result (char or numeric), so emptiness -
% not isnumeric - is the reliable "no match" test.
b = sscanf(z{1},'size =%d');
if isempty(b)
    b = sscanf(z{1},'size=%d');
end
if isempty(b) || ~isnumeric(b)
    error('%s.TextGrid not right PWS format',fname);
end
% b should be 4

X.filename = fname;
X.phrases = {};

%%%%%%%%%%%%%%%%% first (phrase) tier %%%%%%%%%%%%%%%%%%%%%%%%
[numP, i] = tg2pws_tiersize(z, 2);   % numP = number of phrase intervals
if isempty(numP)
    error('%s.TextGrid not right PWS format - cant find size of first (Interval) Tier',fname);
end
% z{i} should be 'intervals [1]:' now; every interval occupies 4 lines.

startphrases = [];
endphrases = [];
lastnum = 0; % last phrase number that is not -1
for n=1:numP
    j = i + 4*n - 3;
    st = sscanf(z{j},'xmin = %f');
    en = sscanf(z{j+1},'xmax = %f');
    num = sscanf(z{j+2},'text = "%d');
    if (~isempty(num) && (num > 0))
        if (num ~= lastnum + 1)
            error('In phrase tier, phrase %d follows phrase %d in %s.TextGrid',num, lastnum, fname);
        end
        lastnum = num;
        X.phrases{num}.filename = fname;
        X.phrases{num}.words = {};
        X.phrases{num}.startwords = [];
        X.phrases{num}.endwords = [];
        X.phrases{num}.wordnames = {};
        X.phrases{num}.validword = [];
        startphrases = [startphrases st];
        endphrases = [endphrases en];
    end
end

i = i + 4*numP; % z{i} should be 'item [2]:'
if ((i+2 > length(z)) || isempty(strfind(z{i+2},'ord')))
    error('Second Interval Tier should be called "word" Tier');
end

%%%%%%%%%%%%%%%%% now read second (word) tier %%%%%%%%%%%%%%%%%%%%%%%%
[numW, i] = tg2pws_tiersize(z, i);
if isempty(numW)
    error('%s.TextGrid not right PWS format - cant find size of second (word) Tier',fname);
end
% z{i} should be 'intervals [1]:' now

prevP = 0;    % phrase of the previous non-SIL word
curWinP = 0;  % number of this word in current phrase
for n=1:numW
    [st, en, wd] = tg2pws_interval(z, i + 4*(n-1) + 1); % remember wd could be '0'
    if ~isempty(strfind(wd,'SIL'))
        continue;
    end

    curP = length(find(startphrases <= st)); % current phrase (0 = before any phrase)
    if (curP > prevP)
        curWinP = 1;
    else
        curWinP = curWinP + 1;
    end

    % NOTE(review): this only rejects words that end after the LAST phrase;
    % a non-SIL word lying between two phrases is attached to the earlier
    % one. endphrases(curP) may have been intended - confirm before
    % changing, since it alters the output for such files.
    if (curP && (endphrases(end) >= en))
        L = length(X.phrases{curP}.wordnames);
        X.phrases{curP}.startwords(L+1) = st;
        X.phrases{curP}.endwords(L+1) = en;
        X.phrases{curP}.wordnames{L+1} = wd;
        if strcmp(wd,'0')
            X.phrases{curP}.validword(L+1) = 0;
            ws = struct('name','invalid');
        else
            X.phrases{curP}.validword(L+1) = 1;
            ws = struct('name',wd);
            ws.numwinphrase = -1; % not known yet
            ws.poswinphrase = curWinP;
            % split the label at the tone digits: 'yi1zhong1' -> {'yi';'zhong'}
            ws.sylls = strread(wd,'%s','delimiter',TONES);
            if isempty(ws.sylls)
                disp([st en curP]);
                disp(wd);
                error ('ws.sylls is empty');
            elseif (1==length(ws.sylls{end}))
                % NOTE(review): a last piece of length 1 is dropped; the
                % reason is not visible here and this also drops a genuine
                % one-letter final syllable - confirm the intent.
                ws.sylls = ws.sylls(1:end-1);
            end
            S = length(ws.sylls);
            ws.tones = zeros(1,S);
            p = 0;
            for s=1:S
                % position of the tone digit that follows syllable s
                p = p + length(ws.sylls{s}) + 1;
                if ((p > length(wd)) || ~any(wd(p) == TONES))
                    error('Word %s (from %f to %f) has a syllable without a tone label',wd,st,en);
                end
                ws.tones(s) = wd(p) - '0'; % assuming tone labels only take one digit!
                ws.sylls{s} = sprintf('%s%s',ws.sylls{s}, wd(p));
            end
            ws.sylls = ws.sylls';
            ws.startsylls = zeros(1,S);   % 0 means "not filled in yet"
            ws.endsylls = zeros(1,S);
            ws.startsyllsPI = zeros(1,S);
            ws.endsyllsPI = zeros(1,S);
            ws.startphones = [];
            ws.endphones = [];
            ws.phones = {};
        end
        X.phrases{curP}.words{L+1} = ws; % ws is a word structure
    end
    prevP = curP;
end % end for n=1:numW

% do some checks
for p=1:length(startphrases)
    numWinP = length(X.phrases{p}.startwords);
    if (numWinP == 0)
        warning('phrase %d in %s.TextGrid contains no words',p,fname);
        continue;
    end
    if (startphrases(p) ~= X.phrases{p}.startwords(1))
        warning('startphrases(%d) = %f isnt equal to X.phrases{%d}.startwords(1) = %f\n',p,startphrases(p),p,X.phrases{p}.startwords(1));
    end
    if (endphrases(p) ~= X.phrases{p}.endwords(end))
        warning('endphrases(%d) = %f isnt equal to X.phrases{%d}.endwords(end) = %f\n',p,endphrases(p),p,X.phrases{p}.endwords(end));
    end
    for w=1:numWinP
        X.phrases{p}.words{w}.numwinphrase = length(X.phrases{p}.words);
    end
end

i = i + 4*numW; % z{i} should be 'item [3]:'

%%%%%%%%%%%%%%%%% now read third (syllabic) tier %%%%%%%%%%%%%%%%%%%%%%%%
[numS, i] = tg2pws_tiersize(z, i);   % numS = number of syllable intervals
if isempty(numS)
    error('%s.TextGrid not right PWS format - cant find size of third (syllable) Tier',fname);
end
% z{i} should be 'intervals [1]:' now

for n=1:numS
    [st, en, syll] = tg2pws_interval(z, i + 4*(n-1) + 1);

    % -1 means the syllable is not inside a single phrase: nothing to do
    curP = tg2pws_phraseof(startphrases, endphrases, st, en);
    if (curP == -1)
        continue;
    end

    curWinP = length(find(X.phrases{curP}.startwords <= st)); % word containing the syllable start
    if (curWinP == 0)
        error('Syllable %s (from %f to %f) starts before the first word of phrase %d',syll,st,en,curP);
    end

    if ~(X.phrases{curP}.validword(curWinP))
        if (~strcmp(syll,'0'))
            warning('Assuming that syllable %s (from %f to %f) is in an invalid word',syll,st,en);
        end
        X.phrases{curP}.words{curWinP}.startsylls(1) = st;
        X.phrases{curP}.words{curWinP}.endsylls(1) = en;
    else
        % presumably FINDCELL returns the indices of the cells equal to
        % syll (it is used below as an index vector) - verify the helper
        curSinW = findcell(X.phrases{curP}.words{curWinP}.sylls, syll);
        if isempty(curSinW)
            disp(X.phrases{curP}.words{curWinP}.sylls);
            error('cant find %s in X.phrases{%d}.words{%d}.sylls above',syll, curP, curWinP);
        end
        if (length(curSinW) > 1)
            % The same syllable occurs several times in the word; this can
            % happen e.g. yiyide in 0700_0328, yi1zhong1yi1tai2 in 0730_0043.
            % Intervals arrive in time order, so this one belongs to the
            % first occurrence that has not been filled in yet.
            pending = curSinW(X.phrases{curP}.words{curWinP}.endsylls(curSinW) == 0);
            if isempty(pending)
                error('syllable %s (from %f to %f) occurs more often than expected in X.phrases{%d}.words{%d}',syll,st,en,curP,curWinP);
            end
            curSinW = pending(1);
        end
        X.phrases{curP}.words{curWinP}.startsylls(curSinW) = st;
        X.phrases{curP}.words{curWinP}.endsylls(curSinW) = en;
    end
end

i = i + 4*numS; % z{i} should be 'item [4]:'

%%%%%%%%%%%%%%%%% now read fourth (phonemic) tier %%%%%%%%%%%%%%%%%%%%%%%%
[numN, i] = tg2pws_tiersize(z, i);   % numN = number of phoNeme intervals
if isempty(numN)
    error('%s.TextGrid not right PWS format - cant find size of fourth (phoneme) Tier',fname);
end
% z{i} should be 'intervals [1]:' now

for n=1:numN
    [st, en, phn] = tg2pws_interval(z, i + 4*(n-1) + 1);

    % if curP is -1 then this phoneme is between two phrases (which is
    % fine, no need to do anything)
    curP = tg2pws_phraseof(startphrases, endphrases, st, en);
    if (curP == -1)
        continue;
    end

    curWinP = length(find(X.phrases{curP}.startwords <= st));
    if (curWinP == 0)
        error('Phoneme %s (from %f to %f) starts before the first word of phrase %d',phn,st,en,curP);
    end

    if (X.phrases{curP}.validword(curWinP))
        % Append to the word's own phone list. Deriving the position from
        % the list itself (instead of a running counter) keeps it correct
        % when two consecutive phonemes belong to different phrases but to
        % words with the same index.
        k = length(X.phrases{curP}.words{curWinP}.startphones) + 1;
        X.phrases{curP}.words{curWinP}.startphones(k) = st;
        X.phrases{curP}.words{curWinP}.endphones(k) = en;
        X.phrases{curP}.words{curWinP}.phones{k} = phn;
    end
end

%%%%%%%%%%%%%%%%% syllable boundaries as phone indices %%%%%%%%%%%%%%%%%%%
% startsyllsPI(s) / endsyllsPI(s) = index of the phone whose start / end
% time equals the start / end time of syllable s (exact comparison of the
% values read from the file).
for ph=1:length(X.phrases)
    for w=1:length(X.phrases{ph}.words)
        if ~(X.phrases{ph}.validword(w))
            continue;
        end
        S = length(X.phrases{ph}.words{w}.startsylls);
        for s=1:S
            first = find(X.phrases{ph}.words{w}.startphones == X.phrases{ph}.words{w}.startsylls(s));
            last = find(X.phrases{ph}.words{w}.endphones == X.phrases{ph}.words{w}.endsylls(s));
            if ((length(first) ~= 1) || (length(last) ~= 1))
                error('Syllable %d of word %s (phrase %d, word %d) in %s.TextGrid does not line up with the phoneme boundaries',s,X.phrases{ph}.words{w}.name,ph,w,fname);
            end
            X.phrases{ph}.words{w}.startsyllsPI(s) = first;
            X.phrases{ph}.words{w}.endsyllsPI(s) = last;
        end
    end
end


function [n, i] = tg2pws_tiersize(z, i)
% Scan z{i}, z{i+1}, ... for a line 'intervals: size = N'.
% Returns N (empty if no such line exists) and the index of the line that
% FOLLOWS the matching line.
n = [];
while ((i <= length(z)) && isempty(n))
    n = sscanf(z{i},'intervals: size = %d');
    i = i + 1;
end


function [st, en, label] = tg2pws_interval(z, j)
% Read one interval: z{j} = 'xmin = ..', z{j+1} = 'xmax = ..',
% z{j+2} = 'text = "label"'. The closing quote is removed from the label;
% a label that cannot be read is returned as ''.
st = sscanf(z{j},'xmin = %f');
en = sscanf(z{j+1},'xmax = %f');
label = sscanf(z{j+2},'text = "%s');
if isempty(label)
    label = '';
elseif ('"' == label(end))
    label = label(1:end-1);
end


function p = tg2pws_phraseof(startphrases, endphrases, st, en)
% Index of the phrase that contains the interval [st, en], or -1 when the
% phrase found from the start time differs from the one found from the end
% time (the interval is outside every phrase). The result is never 0,
% because the end-based index is always at least 1.
fromStart = length(find(startphrases <= st));
fromEnd = length(endphrases) - length(find(endphrases >= en)) + 1;
if (fromStart == fromEnd)
    p = fromStart;
else
    p = -1;
end